From 3e56718730c4ac78b35409ed8038174b8dbfa518 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 25 Oct 2022 15:23:26 -0400 Subject: [PATCH 001/162] Update DESeq Multi-Factor Analysis with Heatmap and MDS --- tools/deseq-multi-factor.cwl | 322 ++++++++++++++++------------- tools/morpheus-heatmap.cwl | 113 ++++++++++ workflows/deseq-multi-factor.cwl | 341 +++++++++++++++++++------------ 3 files changed, 500 insertions(+), 276 deletions(-) create mode 100644 tools/morpheus-heatmap.cwl diff --git a/tools/deseq-multi-factor.cwl b/tools/deseq-multi-factor.cwl index 1f96ac56..61bbfc89 100644 --- a/tools/deseq-multi-factor.cwl +++ b/tools/deseq-multi-factor.cwl @@ -8,7 +8,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/scidap-deseq:v0.0.25 + dockerPull: biowardrobe2/deseq:v0.0.3 inputs: @@ -18,7 +18,8 @@ inputs: inputBinding: prefix: "--expression" doc: | - Path to the TSV/CSV files with expression data. All files should have the following header + Path to the TSV/CSV files with expression data. + All files should have the following header: RefseqId GeneId Chrom TxStart TxEnd Strand TotalReads Rpkm expression_names: @@ -26,41 +27,51 @@ inputs: inputBinding: prefix: "--aliases" doc: | - Unique names for files provided in --expression, no special characters or spaces are allowed. - Number and order of the names should corresponds to values from --expression + Unique names for files provided in --expression, + no special characters or spaces are allowed. + Number and order of the names should corresponds + to values from --expression. metadata_file: type: File inputBinding: prefix: "--metadata" doc: | - Path to the TSV/CSV file to provide metadata for the samples from --expression. - First column should have the name 'sample', other columns may have arbitrary names. - The values from the 'sample' column should correspond to the values provided in --aliases. 
- For a proper --contrast intepretation, values defined in each column should not be used in others. + Path to the TSV/CSV file to provide metadata for the + samples from --expression. First column should have + the name 'sample', other columns may have arbitrary names. + The values from the 'sample' column should correspond to + the values provided in --aliases. For a proper --contrast + intepretation, values defined in each column should not be + used in other columns. All metadata columns are treated as + factors (no covariates are supported). design_formula: type: string inputBinding: prefix: "--design" doc: | - Design formula. Should start with ~ and include terms from the --metadata table + Design formula. Should start with ~ and include terms from + the --metadata table. reduced_formula: type: string? inputBinding: prefix: "--reduced" doc: | - Reduced formula to compare against with the term(s) of interest removed. - Should start with ~. Ignored when run with --wald + Reduced formula with the term(s) of interest removed. + Should start with ~. If provided, force DESeq2 to run + LRT test instead of the Wald. contrast: - type: string + type: string? inputBinding: prefix: "--contrast" doc: | - Contrast to be be applied for the output, formatted as a mathematical formula - of values from the --metadata table + Contrast to be be applied for the output, formatted as + a mathematical formula of values from the --metadata table. + If not provided, the last term from the design formula will + be used. base: type: @@ -70,10 +81,10 @@ inputs: inputBinding: prefix: "--base" doc: | - Value from each column of metadata file to be set as base levels. - Number and order of provided values should correspond the order of columns - in --metadata file. - Default: define base levels alphabetically for each columns of metadata table + Value(s) from each metadata file column(s) to be set as + the base level(s). 
Number and order of provided values should + correspond the order of columns in --metadata file. Default: + define base levels alphabetically for each metadata column. feature_type: type: @@ -83,93 +94,127 @@ inputs: - "gene" - "transcript" inputBinding: - prefix: "--ftype" + prefix: "--type" doc: | Feature type to use for differential expression. If set to 'gene', use 'GeneId' column from the provided in --expression files. If set to 'transcript', use 'RefseqId' from the provided in --expression files. Default: gene - minimum_counts: - type: int? + excluded_features: + type: + - "null" + - string + - string[] inputBinding: - prefix: "--mincounts" + prefix: "--exclude" doc: | - Keep only those features where the total number of counts for all samples - is bigger than this value. - Default: 0 + Features to be excluded from the differential expression analysis. + Default: include all features - use_wald: - type: boolean? + normalization_method: + type: + - "null" + - type: enum + symbols: + - "vst" + - "rlog" inputBinding: - prefix: "--wald" + prefix: "--norm" doc: | - Use pair-wise Wald test instead of LRT. --reduced parameter will be ignored - Default: use LRT test + Read counts normalization for the exploratory visualization analysis. + Use 'vst' for medium-to-large datasets (n > 30) and 'rlog' for + small datasets (n < 30), when there is a wide range of sequencing + depth across samples. + Default: vst - splitby: + remove: type: string? inputBinding: - prefix: "--splitby" + prefix: "--remove" doc: | - Used only in plots. Column from the metadata file to split samples into categories. - Default: the first after the 'sample' column from the metadata file + Column from the metadata file to remove batch effect when + exporting feature counts. All components that include this + term will be removed from the design formula when correcting + for batch effect. Default: do not remove batch effect from + the exported counts - groupby: - type: string? 
+ cluster_method: + type: + - "null" + - type: enum + symbols: + - "row" + - "column" + - "both" inputBinding: - prefix: "--groupby" + prefix: "--cluster" doc: | - Used only in plots. Column from the metadata file to combine samples into groups. - Default: the last column from the metadata file + Hopach clustering method to be run on normalized read counts for the + exploratory visualization analysis. Default: do not run clustering - selected_features: + row_distance: type: - "null" - - string - - string[] + - type: enum + symbols: + - "cosangle" + - "abscosangle" + - "euclid" + - "abseuclid" + - "cor" + - "abscor" inputBinding: - prefix: "--features" + prefix: "--rowdist" doc: | - Used only in plots. Features of interest to label on the generated plots. - Default: --topn N features with the highest and the lowest log2 fold change - expression values. + Distance metric for HOPACH row clustering. Ignored if --cluster is not + provided. Default: cosangle - excluded_features: + column_distance: type: - "null" - - string - - string[] + - type: enum + symbols: + - "cosangle" + - "abscosangle" + - "euclid" + - "abseuclid" + - "cor" + - "abscor" inputBinding: - prefix: "--exfeatures" + prefix: "--columndist" doc: | - Used only in plots. Features to be excluded from the differential expression analysis. - Default: include all features + Distance metric for HOPACH column clustering. Ignored if --cluster is not + provided. Default: euclid - topn_features: - type: int? + center_row: + type: boolean? inputBinding: - prefix: "--topn" + prefix: "--center" doc: | - Used only in plots. Show N features with the highest and N features with the lowest log2 fold - change expression values. Ignored with --features. - Default: 10 + Apply mean centering for feature expression prior to running + clustering by row. Ignored when --cluster is not row or both. + Default: do not centered - maximum_padj: - type: float? 
+ selected_features: + type: + - "null" + - string + - string[] inputBinding: - prefix: "--padj" + prefix: "--label" doc: | - Used only in plots. Output only features with adjusted P-value not bigger than this treshold. - Default: 0.05 + Features of interest to label on the generated volcanot plot. Default: + top 10 features with the highest and the lowest log2 fold change + expression values. - use_pvalue: - type: boolean? + maximum_padj: + type: float? inputBinding: - prefix: "--usepvalue" + prefix: "--padj" doc: | - Used only in plots. Treat --padj as a theshold for P-value - Default: --padj defines the treshold for adjusted P-value + In the exploratory visualization analysis output only features with + adjusted P-value not bigger than this value. Default: 0.05 export_pdf_plots: type: boolean? @@ -182,7 +227,6 @@ inputs: output_prefix: type: string? inputBinding: - position: 9 prefix: "--output" doc: | Output prefix for generated files @@ -190,10 +234,9 @@ inputs: threads: type: int? inputBinding: - position: 10 - prefix: "--threads" + prefix: "--cpus" doc: | - Threads number + Number of cores/cpus to use. Default: 1 outputs: @@ -205,6 +248,13 @@ outputs: doc: | TSV file with not filtered differentially expressed features + read_counts_gct: + type: File + outputBinding: + glob: "*_norm_read_counts.gct" + doc: | + GCT file with normalized, optionally batch corrected, read counts + volcano_plot_png: type: File? outputBinding: @@ -226,7 +276,7 @@ outputs: outputBinding: glob: "*_pca_plot.png" doc: | - PCA plot of rlog-normalized counts based on the top 500 + PCA plot of normalized counts based on the top 500 features selected by the highest row variance PNG format @@ -235,25 +285,18 @@ outputs: outputBinding: glob: "*_pca_plot.pdf" doc: | - PCA plot of rlog-normalized counts based on the top 500 + PCA plot of normalized counts based on the top 500 features selected by the highest row variance PDF format - counts_plot_png: - type: File? 
- outputBinding: - glob: "*_counts_plot.png" - doc: | - rlog-normalized counts plots - PNG format - - counts_plot_pdf: + mds_plot_html: type: File? outputBinding: - glob: "*_counts_plot.pdf" + glob: "*_mds_plot.html" doc: | - rlog-normalized counts plots - PDF format + MDS plot of normalized counts. Optionally batch corrected + based on the --remove value. + HTML format stdout_log: type: stdout @@ -263,8 +306,8 @@ outputs: baseCommand: [run_deseq_manual.R] -stdout: deseq_multi_factor_stdout.log -stderr: deseq_multi_factor_stderr.log +stdout: deseq_manual_factor_stdout.log +stderr: deseq_manual_factor_stderr.log $namespaces: @@ -279,7 +322,7 @@ label: "DESeq2 Multi-factor Analysis" s:alternateName: "Runs DeSeq2 multi-factor analysis with manual control over major parameters" -s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/deseq-lrt.cwl +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/deseq-multi-factor.cwl s:codeRepository: https://github.com/Barski-lab/workflows s:license: http://www.apache.org/licenses/LICENSE-2.0 @@ -315,7 +358,7 @@ s:creator: doc: | DESeq2 Multi-factor Analysis - ============================ + Runs DeSeq2 multi-factor analysis with manual control over major parameters @@ -323,15 +366,14 @@ s:about: | usage: run_deseq_manual.R [-h] --expression EXPRESSION [EXPRESSION ...] --aliases ALIASES [ALIASES ...] 
--metadata METADATA --design DESIGN [--reduced REDUCED] - --contrast CONTRAST [--base [BASE ...]] [--ftype {gene,transcript}] - [--mincounts MINCOUNTS] [--wald] [--splitby SPLITBY] - [--groupby GROUPBY] [--features [FEATURES ...]] - [--exfeatures [EXFEATURES ...]] [--topn TOPN] [--padj PADJ] - [--usepvalue] [--pdf] [--output OUTPUT] [--threads THREADS] + [--contrast CONTRAST] [--base [BASE ...]] [--type {gene,transcript}] + [--exclude [EXCLUDE ...]] [--norm {vst,rlog}] [--remove REMOVE] + [--cluster {row,column,both}] [--center] [--label [LABEL ...]] + [--padj PADJ] [--pdf] [--output OUTPUT] [--cpus CPUS] - Run DeSeq2 with manual control over major parameters + DESeq2 Multi-factor Analysis - optional arguments: + options: -h, --help show this help message and exit --expression EXPRESSION [EXPRESSION ...] Path to the TSV/CSV files with expression data. All @@ -341,63 +383,63 @@ s:about: | Unique names for files provided in --expression, no special characters or spaces are allowed. Number and order of the names should corresponds to values from - --expression + --expression. --metadata METADATA Path to the TSV/CSV file to provide metadata for the samples from --expression. First column should have the name 'sample', other columns may have arbitrary names. The values from the 'sample' column should correspond to the values provided in --aliases. For a proper --contrast intepretation, values defined in - each column should not be used in others. + each column should not be used in other columns. All + metadata columns are treated as factors (no covariates + are supported). --design DESIGN Design formula. Should start with ~ and include terms - from the --metadata table - --reduced REDUCED Reduced formula to compare against with the term(s) of - interest removed. Should start with ~. Ignored when - run with --wald + from the --metadata table. + --reduced REDUCED Reduced formula with the term(s) of interest removed. + Should start with ~. 
If provided, force DESeq2 to run + LRT test instead of the Wald. --contrast CONTRAST Contrast to be be applied for the output, formatted as a mathematical formula of values from the --metadata - table - --base [BASE ...] Value from each column of metadata file to be set as - base levels. Number and order of provided values - should correspond the order of columns in --metadata - file. Default: define base levels alphabetically for - each columns of metadata table - --ftype {gene,transcript} + table. If not provided, the last term from the design + formula will be used. + --base [BASE ...] Value(s) from each metadata file column(s) to be set + as the base level(s). Number and order of provided + values should correspond the order of columns in + --metadata file. Default: define base levels + alphabetically for each metadata column. + --type {gene,transcript} Feature type to use for differential expression. If set to 'gene', use 'GeneId' column from the provided in --expression files. If set to 'transcript', use 'RefseqId' from the provided in --expression files. Default: gene - --mincounts MINCOUNTS - Keep only those features where the total number of - counts for all samples is bigger than this value. - Default: 0 - --wald Use pair-wise Wald test instead of LRT. --reduced - parameter will be ignored Default: use LRT test - --splitby SPLITBY Used only in plots. Column from the metadata file to - split samples into categories. Default: the first - after the 'sample' column from the metadata file - --groupby GROUPBY Used only in plots. Column from the metadata file to - combine samples into groups. Default: the last column - from the metadata file - --features [FEATURES ...] - Used only in plots. Features of interest to label on - the generated plots. Default: --topn N features with - the highest and the lowest log2 fold change expression + --exclude [EXCLUDE ...] + Features to be excluded from the differential + expression analysis. 
Default: include all features + --norm {vst,rlog} Read counts normalization for the exploratory + visualization analysis. Use 'vst' for medium-to-large + datasets (n > 30) and 'rlog' for small datasets (n < + 30), when there is a wide range of sequencing depth + across samples. Default: vst + --remove REMOVE Column from the metadata file to remove batch effect + when exporting feature counts. All components that + include this term will be removed from the design + formula when correcting for batch effect. Default: do + not remove batch effect from the exported counts + --cluster {row,column,both} + Hopach clustering method to be run on normalized read + counts for the exploratory visualization analysis. + Default: do not run clustering + --center Apply mean centering for feature expression prior to + running clustering by row. Ignored when --cluster is + not row or both. Default: do not centered + --label [LABEL ...] Features of interest to label on the generated + volcanot plot. Default: top 10 features with the + highest and the lowest log2 fold change expression values. - --exfeatures [EXFEATURES ...] - Used only in plots. Features to be excluded from the - differential expression analysis. Default: include all - features - --topn TOPN Used only in plots. Show N features with the highest - and N features with the lowest log2 fold change - expression values. Ignored with --features. Default: - 10 - --padj PADJ Used only in plots. Output only features with adjusted - P-value not bigger than this treshold. Default: 0.05 - --usepvalue Used only in plots. Treat --padj as a theshold for - P-value Default: --padj defines the treshold for - adjusted P-value + --padj PADJ In the exploratory visualization analysis output only + features with adjusted P-value not bigger than this + value. Default: 0.05 --pdf Export plots in PDF. 
Default: false --output OUTPUT Output prefix for generated files - --threads THREADS Threads number \ No newline at end of file + --cpus CPUS Number of cores/cpus to use. Default: 1 \ No newline at end of file diff --git a/tools/morpheus-heatmap.cwl b/tools/morpheus-heatmap.cwl new file mode 100644 index 00000000..3a4c8831 --- /dev/null +++ b/tools/morpheus-heatmap.cwl @@ -0,0 +1,113 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/deseq:v0.0.2 + + +inputs: + + read_counts_gct: + type: File + inputBinding: + prefix: "--gct" + doc: | + Path to the input GCT file. + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix for generated files + + +outputs: + + heatmap_html: + type: File + outputBinding: + glob: "*.html" + doc: | + Morpheus heatmap in HTML format + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: [run_morpheus.R] +stdout: morpheus_stdout.log +stderr: morpheus_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +s:name: "Morpheus Heatmap" +label: "Morpheus Heatmap" +s:alternateName: "Generates Morpheus heatmap from input GCT file" + + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/morpheus-heatmap.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: 
"+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + +doc: | + Morpheus Heatmap + + Generates Morpheus heatmap from input GCT file + + +s:about: | + usage: run_morpheus.R + [-h] --gct GCT [--cluster {row,column,both}] [--output OUTPUT] + + Morpheus heatmap from GCT file + + options: + -h, --help show this help message and exit + --gct GCT Path to the input GCT file. + --output OUTPUT Output prefix for generated files \ No newline at end of file diff --git a/workflows/deseq-multi-factor.cwl b/workflows/deseq-multi-factor.cwl index ed8b92cf..d9bf46e8 100644 --- a/workflows/deseq-multi-factor.cwl +++ b/workflows/deseq-multi-factor.cwl @@ -43,8 +43,8 @@ inputs: - File[] label: "RNA-Seq experiments" doc: | - TSV/CSV files with expression data grouped by isoforms. - The following header is required: + Path to the TSV/CSV files with expression data. + All files should have the following header: RefseqId GeneId Chrom TxStart TxEnd Strand TotalReads Rpkm 'sd:upstreamSource': "rnaseq_experiment/rpkm_isoforms" 'sd:localLabel': true @@ -55,8 +55,8 @@ inputs: - File[] label: "RNA-Seq experiments" doc: | - TSV/CSV files with expression data grouped by genes. - The following header is required: + Path to the TSV/CSV files with expression data. 
+ All files should have the following header: RefseqId GeneId Chrom TxStart TxEnd Strand TotalReads Rpkm 'sd:upstreamSource': "rnaseq_experiment/rpkm_genes" 'sd:localLabel': true @@ -65,151 +65,201 @@ inputs: type: string[] label: "RNA-Seq experiments" doc: | - Unique names for files provided in 'isoforms_expression_files' or - 'genes_expression_files' inputs. No special characters or spaces are allowed. - Number and order of the names should corresponds to order of files. + Unique names for files provided in --expression, + no special characters or spaces are allowed. + Number and order of the names should corresponds + to values from --expression. 'sd:upstreamSource': "rnaseq_experiment/alias" feature_type: type: - "null" - type: enum - symbols: ["transcript", "gene"] + symbols: + - "transcript" + - "gene" default: "gene" label: "Feature type to use for differential expression" doc: | Feature type to use for differential expression. - If set to 'gene', the 'GeneId' column will be renamed to 'feature' and used - for differential expression. If set to 'transcript', the 'RefseqId' column - will be used instead. + If set to 'gene', use 'GeneId' column from the provided in --expression files. + If set to 'transcript', use 'RefseqId' from the provided in --expression files. + Default: gene design_formula: type: string label: "Design formula" doc: | - Design formula should start with ~ and include terms from the 'metadata_file' + Design formula. Should start with ~ and include terms from + the --metadata table. reduced_formula: type: string? - label: "Reduced formula" + label: "Reduced formula. If provided, use LRT instead of Wald." doc: | - Reduced formula to compare against with the term(s) of interest removed. - Should start with ~. Ignored when 'use_wald' is set to true. + Reduced formula with the term(s) of interest removed. + Should start with ~. If provided, force DESeq2 to run + LRT test instead of the Wald. - use_wald: - type: boolean? 
- label: "Use pair-wise Wald test instead of LRT" + contrast: + type: string? + label: "Contrast. If not provided, use the last term from the design formula." doc: | - Use pair-wise Wald test instead of LRT. 'reduced_formula' parameter will be ignored - Default: use LRT test + Contrast to be be applied for the output, formatted as + a mathematical formula of values from the --metadata table. + If not provided, the last term from the design formula will + be used. - contrast: - type: string - label: "Contrast to be be applied for the output" + remove: + type: string? + label: "Column from the metadata file to remove batch effect when exporting feature counts" doc: | - Contrast to be be applied for the output, formatted as a mathematical formula - of values from the 'metadata_file' + Column from the metadata file to remove batch effect when + exporting feature counts. All components that include this + term will be removed from the design formula when correcting + for batch effect. Default: do not remove batch effect from + the exported counts base: type: string? - label: "Values from each column of the metadata file to be set as base levels" + label: "Values from each column of the metadata file to be set as base levels. Order matters." doc: | - Value from each column of 'metadata_file' to be set as base levels. - Number and order of provided values should correspond the order of columns - in the 'metadata_file'. - Default: define base levels alphabetically for each columns of 'metadata_file' + Value(s) from each metadata file column(s) to be set as + the base level(s). Number and order of provided values should + correspond the order of columns in --metadata file. Default: + define base levels alphabetically for each metadata column. metadata_file: type: File label: "Metadata file to assign categories to datasets" doc: | - TSV/CSV file to provide metadata for the samples from 'isoforms_expression_files' - or 'genes_expression_files' inputs. 
First column should have the name 'sample', - other columns may have arbitrary names. The values from the 'sample' column should - correspond to the values provided in 'expression_names' input. For a proper 'contrast' - intepretation, values defined in each column should not be used in others. + Path to the TSV/CSV file to provide metadata for the + samples from --expression. First column should have + the name 'sample', other columns may have arbitrary names. + The values from the 'sample' column should correspond to + the values provided in --aliases. For a proper --contrast + intepretation, values defined in each column should not be + used in other columns. All metadata columns are treated as + factors (no covariates are supported). + + normalization_method: + type: + - "null" + - type: enum + symbols: + - "vst" + - "rlog" + default: "vst" + label: "Read counts normalization for the exploratory visualization analysis" + doc: | + Read counts normalization for the exploratory visualization analysis. + Use 'vst' for medium-to-large datasets (n > 30) and 'rlog' for + small datasets (n < 30), when there is a wide range of sequencing + depth across samples. + Default: vst + 'sd:layout': + advanced: true - minimum_counts: - type: int? - default: 0 - label: "Minimum number of counts among all samples for feature to be included in the analysis" + center_row: + type: boolean? + default: false + label: "Apply mean centering for feature expression prior to running clustering by row" doc: | - Keep only those features where the total number of counts for all samples - is bigger than this value. + Apply mean centering for feature expression prior to running + clustering by row. Ignored when --cluster is not row or both. + Default: do not centered 'sd:layout': advanced: true - splitby: - type: string? 
- label: "Column from the metadata file to split samples into categories (plots only)" + cluster_method: + type: + - "null" + - type: enum + symbols: + - "row" + - "column" + - "both" + - "none" + default: "none" + label: "Hopach clustering method to be run on normalized read counts" doc: | - Used only in plots. Column from the metadata file to split samples into categories. - Default: the first after the 'sample' column from the metadata file + Hopach clustering method to be run on normalized read counts for the + exploratory visualization analysis. Default: do not run clustering 'sd:layout': advanced: true - groupby: - type: string? - label: "Column from the metadata file to combine samples into groups (plots only)" + row_distance: + type: + - "null" + - type: enum + symbols: + - "cosangle" + - "abscosangle" + - "euclid" + - "abseuclid" + - "cor" + - "abscor" + default: "cosangle" + label: "Distance metric for HOPACH row clustering" doc: | - Used only in plots. Column from the metadata file to combine samples into groups. - Default: the last column from the metadata file + Distance metric for HOPACH row clustering. Ignored if --cluster is not + provided. Default: cosangle + 'sd:layout': + advanced: true + + column_distance: + type: + - "null" + - type: enum + symbols: + - "cosangle" + - "abscosangle" + - "euclid" + - "abseuclid" + - "cor" + - "abscor" + default: "euclid" + label: "Distance metric for HOPACH column clustering" + doc: | + Distance metric for HOPACH column clustering. Ignored if --cluster is not + provided. Default: euclid 'sd:layout': advanced: true selected_features: type: string? - label: "Features of interest to label on the generated plots (plots only)" + label: "Features of interest to label on the volcano plot" doc: | - Used only in plots. Features of interest to label on the generated plots. - Default: 'topn_features' features with the highest and the lowest log2 fold change + Features of interest to label on the generated volcanot plot. 
Default: + top 10 features with the highest and the lowest log2 fold change expression values. 'sd:layout': advanced: true excluded_features: type: string? - label: "Features to be excluded from the differential expression analysis (plots only)" + label: "Features to be excluded from the differential expression analysis" doc: | - Used only in plots. Features to be excluded from the differential expression analysis. + Features to be excluded from the differential expression analysis. Default: include all features 'sd:layout': advanced: true - topn_features: - type: int? - default: 10 - label: "Top 2 x N features with the highest absolute log2 fold change values (plots only)" - doc: | - Used only in plots. Show N features with the highest and N features with the lowest log2 fold - change expression values. Ignored with 'selected_features'. - Default: 10 - 'sd:layout': - advanced: true - maximum_padj: type: float? default: 0.05 - label: "Use only features with the adjusted P-value not bigger than this treshold (plots only)" + label: "Maximum P-adjusted to show features in the exploratory visualization analysis" doc: | - Used only in plots. Output only features with adjusted P-value not bigger than this treshold. - 'sd:layout': - advanced: true - - use_pvalue: - type: boolean? - label: "Treat 'maximum_padj' as a theshold for P-value (plots only)" - doc: | - Used only in plots. Treat --padj as a theshold for P-value - Default: 'maximum_padj' defines the treshold for adjusted P-value + In the exploratory visualization analysis output only features with + adjusted P-value not bigger than this value. Default: 0.05 'sd:layout': advanced: true threads: type: int? - label: "Number of threads" - doc: "Number of threads for those steps that support multithreading" default: 1 + label: "Number of cores/cpus to use" + doc: "Number of cores/cpus to use. 
Default: 1" 'sd:layout': advanced: true @@ -224,9 +274,29 @@ outputs: TSV file with not filtered differentially expressed features 'sd:visualPlugins': - syncfusiongrid: - tab: 'Differential Expression Analysis' + tab: 'DE features' Title: 'Differentially expressed features' + read_counts_gct: + type: File + outputSource: deseq_multi_factor/read_counts_gct + label: "GCT file with normalized, optionally batch corrected, read counts" + doc: | + GCT file with normalized, optionally batch corrected, read counts + + mds_plot_html: + type: File? + outputSource: deseq_multi_factor/mds_plot_html + label: "MDS plot of normalized counts" + doc: | + MDS plot of normalized counts. Optionally batch corrected + based on the --remove value. + HTML format + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + volcano_plot_png: type: File? outputSource: deseq_multi_factor/volcano_plot_png @@ -239,55 +309,29 @@ outputs: tab: 'Plots' Caption: 'Volcano plot of differentially expressed features' - volcano_plot_pdf: - type: File? - outputSource: deseq_multi_factor/volcano_plot_pdf - label: "Volcano plot of differentially expressed features" - doc: | - Volcano plot of differentially expressed features. - PDF format - pca_plot_png: type: File? outputSource: deseq_multi_factor/pca_plot_png - label: "PCA plot of rlog-normalized counts based on the top 500 features with the highest row variance" + label: "PCA plot of normalized counts based on the top 500 features with the highest row variance" doc: | - PCA plot of rlog-normalized counts based on the top 500 + PCA plot of normalized counts based on the top 500 features selected by the highest row variance PNG format 'sd:visualPlugins': - image: tab: 'Plots' - Caption: 'PCA plot of rlog-normalized counts based on the top 500 features with the highest row variance' + Caption: 'PCA plot of normalized counts based on the top 500 features with the highest row variance' - pca_plot_pdf: - type: File? 
- outputSource: deseq_multi_factor/pca_plot_pdf - label: "PCA plot of rlog-normalized counts based on the top 500 features with the highest row variance" - doc: | - PCA plot of rlog-normalized counts based on the top 500 - features selected by the highest row variance - PDF format - - counts_plot_png: - type: File? - outputSource: deseq_multi_factor/counts_plot_png - label: "rlog-normalized counts plots" + heatmap_html: + type: File + outputSource: morpheus_heatmap/heatmap_html + label: "Heatmap of normalized counts" doc: | - rlog-normalized counts plots - PNG format + Morpheus heatmap in HTML format 'sd:visualPlugins': - - image: - tab: 'Plots' - Caption: 'rlog-normalized counts plots' - - counts_plot_pdf: - type: File? - outputSource: deseq_multi_factor/counts_plot_pdf - label: "rlog-normalized counts plots" - doc: | - rlog-normalized counts plots - PDF format + - linkList: + tab: 'Overview' + target: "_blank" deseq_stdout_log: type: File @@ -301,6 +345,18 @@ outputs: label: "DESeq stderr log" doc: "DESeq stderr log" + morpheus_stdout_log: + type: File + outputSource: morpheus_heatmap/stdout_log + label: "Morpheus heatmap stdout log" + doc: "Morpheus heatmap stdout log" + + morpheus_stderr_log: + type: File + outputSource: morpheus_heatmap/stderr_log + label: "Morpheus heatmap stderr log" + doc: "Morpheus heatmap stderr log" + steps: @@ -320,36 +376,49 @@ steps: expression_names: expression_names metadata_file: metadata_file design_formula: design_formula - reduced_formula: reduced_formula - contrast: contrast + reduced_formula: + source: reduced_formula + valueFrom: $(self==""?null:self) # safety measure + contrast: + source: contrast + valueFrom: $(self==""?null:self) # safety measure base: source: base valueFrom: $(split_by_common_delim(self)) feature_type: feature_type - minimum_counts: minimum_counts - use_wald: use_wald - splitby: splitby - groupby: groupby - selected_features: - source: selected_features - valueFrom: $(split_by_common_delim(self)) 
excluded_features: source: excluded_features valueFrom: $(split_by_common_delim(self)) - topn_features: topn_features + normalization_method: normalization_method + remove: + source: remove + valueFrom: $(self==""?null:self) # safety measure + cluster_method: + source: cluster_method + valueFrom: $(self=="none"?null:self) + row_distance: row_distance + column_distance: column_distance + center_row: center_row + selected_features: + source: selected_features + valueFrom: $(split_by_common_delim(self)) maximum_padj: maximum_padj - use_pvalue: use_pvalue - export_pdf_plots: - default: true threads: threads out: - diff_expr_features + - read_counts_gct - volcano_plot_png - - volcano_plot_pdf - pca_plot_png - - pca_plot_pdf - - counts_plot_png - - counts_plot_pdf + - mds_plot_html + - stdout_log + - stderr_log + + morpheus_heatmap: + run: ../tools/morpheus-heatmap.cwl + in: + read_counts_gct: deseq_multi_factor/read_counts_gct + out: + - heatmap_html - stdout_log - stderr_log @@ -401,5 +470,5 @@ s:creator: doc: | DESeq2 Multi-factor Analysis - ============================ + Runs DeSeq2 multi-factor analysis with manual control over major parameters \ No newline at end of file From e4e8e17a5f245a0f3b5e0069cb8b144e2fba482b Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 7 Nov 2022 12:26:12 -0500 Subject: [PATCH 002/162] Update dockerfile in the morpheus heatmap tool --- tools/morpheus-heatmap.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/morpheus-heatmap.cwl b/tools/morpheus-heatmap.cwl index 3a4c8831..a2e8f8ff 100644 --- a/tools/morpheus-heatmap.cwl +++ b/tools/morpheus-heatmap.cwl @@ -8,7 +8,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/deseq:v0.0.2 + dockerPull: biowardrobe2/morpheus:v0.0.1 inputs: From 723fa52f40646edbb02bea9a6646e66121e0ef84 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 14 Nov 2022 15:30:11 -0500 Subject: [PATCH 003/162] Update standard DESeq pipeline with MDS plot 
and heatmap --- tests/conformance_tests.yaml | 84 ++++++++++++++++++ tools/deseq-advanced.cwl | 131 ++++++++++++++++++++++++---- tools/deseq-lrt.cwl | 2 +- tools/group-isoforms.cwl | 2 +- tools/volcanot-plot.cwl | 95 ++++++++++++++++++++ workflows/deseq.cwl | 162 +++++++++++++++++++++++++++++++++++ 6 files changed, 457 insertions(+), 19 deletions(-) create mode 100644 tools/volcanot-plot.cwl diff --git a/tests/conformance_tests.yaml b/tests/conformance_tests.yaml index ab2c77ee..80d09c13 100644 --- a/tests/conformance_tests.yaml +++ b/tests/conformance_tests.yaml @@ -1383,6 +1383,27 @@ phenotypes_file: location: deseq_phenotypes.cls class: File + mds_plot_html: + location: deseq_mds_plot.html + class: File + volcano_plot_html_file: + location: index.html + class: File + volcano_plot_css_file: + location: index.css + class: File + volcano_plot_js_file: + location: index.js + class: File + heatmap_html: + location: heatmap.html + class: File + morpheus_stdout_log: + location: morpheus_stdout.log + class: File + morpheus_stderr_log: + location: morpheus_stderr.log + class: File gene_expr_heatmap: location: deseq_expression_heatmap.png class: File @@ -1420,6 +1441,27 @@ phenotypes_file: location: deseq_phenotypes.cls class: File + mds_plot_html: + location: deseq_mds_plot.html + class: File + volcano_plot_html_file: + location: index.html + class: File + volcano_plot_css_file: + location: index.css + class: File + volcano_plot_js_file: + location: index.js + class: File + heatmap_html: + location: heatmap.html + class: File + morpheus_stdout_log: + location: morpheus_stdout.log + class: File + morpheus_stderr_log: + location: morpheus_stderr.log + class: File gene_expr_heatmap: location: deseq_expression_heatmap.png class: File @@ -1457,6 +1499,27 @@ phenotypes_file: location: deseq_phenotypes.cls class: File + mds_plot_html: + location: deseq_mds_plot.html + class: File + volcano_plot_html_file: + location: index.html + class: File + volcano_plot_css_file: + location: 
index.css + class: File + volcano_plot_js_file: + location: index.js + class: File + heatmap_html: + location: heatmap.html + class: File + morpheus_stdout_log: + location: morpheus_stdout.log + class: File + morpheus_stderr_log: + location: morpheus_stderr.log + class: File gene_expr_heatmap: location: deseq_expression_heatmap.png class: File @@ -1494,6 +1557,27 @@ phenotypes_file: location: deseq_phenotypes.cls class: File + mds_plot_html: + location: deseq_mds_plot.html + class: File + volcano_plot_html_file: + location: index.html + class: File + volcano_plot_css_file: + location: index.css + class: File + volcano_plot_js_file: + location: index.js + class: File + heatmap_html: + location: heatmap.html + class: File + morpheus_stdout_log: + location: morpheus_stdout.log + class: File + morpheus_stderr_log: + location: morpheus_stderr.log + class: File gene_expr_heatmap: location: deseq_expression_heatmap.png class: File diff --git a/tools/deseq-advanced.cwl b/tools/deseq-advanced.cwl index f05cebbf..91815ae0 100644 --- a/tools/deseq-advanced.cwl +++ b/tools/deseq-advanced.cwl @@ -8,7 +8,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/scidap-deseq:v0.0.24 + dockerPull: biowardrobe2/scidap-deseq:v0.0.26 inputs: @@ -72,6 +72,72 @@ inputs: doc: | Minimum threshold for rpkm filtering. Default: 5 + cluster_method: + type: + - "null" + - type: enum + symbols: + - "row" + - "column" + - "both" + inputBinding: + prefix: "--cluster" + doc: | + Hopach clustering method to be run on normalized read counts for the + exploratory visualization part of the analysis. Default: do not run + clustering + + row_distance: + type: + - "null" + - type: enum + symbols: + - "cosangle" + - "abscosangle" + - "euclid" + - "abseuclid" + - "cor" + - "abscor" + inputBinding: + prefix: "--rowdist" + doc: | + Distance metric for HOPACH row clustering. Ignored if --cluster is not + provided. 
Default: cosangle + + column_distance: + type: + - "null" + - type: enum + symbols: + - "cosangle" + - "abscosangle" + - "euclid" + - "abseuclid" + - "cor" + - "abscor" + inputBinding: + prefix: "--columndist" + doc: | + Distance metric for HOPACH column clustering. Ignored if --cluster is not + provided. Default: euclid + + center_row: + type: boolean? + inputBinding: + prefix: "--center" + doc: | + Apply mean centering for feature expression prior to running + clustering by row. Ignored when --cluster is not row or both. + Default: do not centered + + maximum_padj: + type: float? + inputBinding: + prefix: "--padj" + doc: | + In the exploratory visualization part of the analysis output only features + with adjusted P-value not bigger than this value. Default: 0.05 + batch_file: type: File? inputBinding: @@ -143,6 +209,15 @@ outputs: outputBinding: glob: "*_pca_plot.pdf" + mds_plot_html: + type: File? + outputBinding: + glob: "*_mds_plot.html" + doc: | + MDS plot of normalized counts. Optionally batch corrected + based on the --remove value. + HTML format + stdout_log: type: stdout @@ -220,31 +295,37 @@ doc: | s:about: | - usage: run_deseq.R + usage: /Users/kot4or/workspaces/cwl_ws/workflows/tools/dockerfiles/scripts/run_deseq.R [-h] -u UNTREATED [UNTREATED ...] -t TREATED [TREATED ...] 
- [-ua [UALIAS [UALIAS ...]]] [-ta [TALIAS [TALIAS ...]]] [-un UNAME] - [-tn TNAME] [-bf BATCHFILE] [-cu CUTOFF] [-o OUTPUT] [-d DIGITS] - [-p THREADS] + [-ua [UALIAS ...]] [-ta [TALIAS ...]] [-un UNAME] [-tn TNAME] + [-bf BATCHFILE] [-cu CUTOFF] [--padj PADJ] + [--cluster {row,column,both}] + [--rowdist {cosangle,abscosangle,euclid,abseuclid,cor,abscor}] + [--columndist {cosangle,abscosangle,euclid,abseuclid,cor,abscor}] + [--center] [-o OUTPUT] [-d DIGITS] [-p THREADS] - Run BioWardrobe DESeq/DESeq2 for untreated-vs-treated groups + Run BioWardrobe DESeq/DESeq2 for untreated-vs-treated groups (condition-1-vs- + condition-2) - optional arguments: + options: -h, --help show this help message and exit -u UNTREATED [UNTREATED ...], --untreated UNTREATED [UNTREATED ...] - Untreated CSV/TSV isoforms expression files + Untreated (condition 1) CSV/TSV isoforms expression + files -t TREATED [TREATED ...], --treated TREATED [TREATED ...] - Treated CSV/TSV isoforms expression files - -ua [UALIAS [UALIAS ...]], --ualias [UALIAS [UALIAS ...]] - Unique aliases for untreated expression files. - Default: basenames of -u without extensions - -ta [TALIAS [TALIAS ...]], --talias [TALIAS [TALIAS ...]] - Unique aliases for treated expression files. Default: - basenames of -t without extensions + Treated (condition 2) CSV/TSV isoforms expression + files + -ua [UALIAS ...], --ualias [UALIAS ...] + Unique aliases for untreated (condition 1) expression + files. Default: basenames of -u without extensions + -ta [TALIAS ...], --talias [TALIAS ...] + Unique aliases for treated (condition 2) expression + files. 
Default: basenames of -t without extensions -un UNAME, --uname UNAME - Name for untreated condition, use only letters and + Name for untreated (condition 1), use only letters and numbers -tn TNAME, --tname TNAME - Name for treated condition, use only letters and + Name for treated (condition 2), use only letters and numbers -bf BATCHFILE, --batchfile BATCHFILE Metadata file for multi-factor analysis. Headerless @@ -253,6 +334,22 @@ s:about: | None -cu CUTOFF, --cutoff CUTOFF Minimum threshold for rpkm filtering. Default: 5 + --padj PADJ In the exploratory visualization part of the analysis + output only features with adjusted P-value not bigger + than this value. Default: 0.05 + --cluster {row,column,both} + Hopach clustering method to be run on normalized read + counts for the exploratory visualization part of the + analysis. Default: do not run clustering + --rowdist {cosangle,abscosangle,euclid,abseuclid,cor,abscor} + Distance metric for HOPACH row clustering. Ignored if + --cluster is not provided. Default: cosangle + --columndist {cosangle,abscosangle,euclid,abseuclid,cor,abscor} + Distance metric for HOPACH column clustering. Ignored + if --cluster is not provided. Default: euclid + --center Apply mean centering for feature expression prior to + running clustering by row. Ignored when --cluster is + not row or both. Default: do not centered -o OUTPUT, --output OUTPUT Output prefix. 
Default: deseq -d DIGITS, --digits DIGITS diff --git a/tools/deseq-lrt.cwl b/tools/deseq-lrt.cwl index 32f6a06b..801c9e7b 100644 --- a/tools/deseq-lrt.cwl +++ b/tools/deseq-lrt.cwl @@ -8,7 +8,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/scidap-deseq:v0.0.24 + dockerPull: biowardrobe2/scidap-deseq:v0.0.26 inputs: diff --git a/tools/group-isoforms.cwl b/tools/group-isoforms.cwl index 9a71fa9b..a9dd3ac4 100644 --- a/tools/group-isoforms.cwl +++ b/tools/group-isoforms.cwl @@ -8,7 +8,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/scidap-deseq:v0.0.24 + dockerPull: biowardrobe2/scidap-deseq:v0.0.26 inputs: diff --git a/tools/volcanot-plot.cwl b/tools/volcanot-plot.cwl new file mode 100644 index 00000000..c446f3e3 --- /dev/null +++ b/tools/volcanot-plot.cwl @@ -0,0 +1,95 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/visualization:v0.0.3 + + +inputs: + + diff_expr_file: + type: File? + doc: | + This input is not being used for now, but we need it so we can + guarantee the order of workflow steps execution when we use this + tool in a workflow. 
+ + +outputs: + + css_file: + type: File + outputBinding: + glob: "./html_data/index.css" + doc: | + CSS file for Volcano Plot + + js_file: + type: File + outputBinding: + glob: "./html_data/index.js" + doc: | + Javascript file for Volcano Plot + + html_file: + type: File + outputBinding: + glob: "./html_data/index.html" + doc: | + HTML index file for Volcano Plot + + +baseCommand: ["volcano_plot.sh"] + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Volcano Plot" +s:name: "Volcano Plot" +s:alternateName: "Builds volcano plot from the DESeq output" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/volcanot-plot.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + +doc: | + Volcano Plot + + Builds volcano plot from the DESeq output diff --git a/workflows/deseq.cwl b/workflows/deseq.cwl index 434f55e0..4af2e643 100644 --- a/workflows/deseq.cwl +++ b/workflows/deseq.cwl @@ -151,6 +151,82 @@ inputs: 
'sd:layout': advanced: true + cluster_method: + type: + - "null" + - type: enum + symbols: + - "row" + - "column" + - "both" + - "none" + default: "none" + label: "Hopach clustering method to be run on normalized read counts" + doc: | + Hopach clustering method to be run on normalized read counts for the + exploratory visualization analysis. Default: do not run clustering + 'sd:layout': + advanced: true + + row_distance: + type: + - "null" + - type: enum + symbols: + - "cosangle" + - "abscosangle" + - "euclid" + - "abseuclid" + - "cor" + - "abscor" + default: "cosangle" + label: "Distance metric for HOPACH row clustering" + doc: | + Distance metric for HOPACH row clustering. Ignored if --cluster is not + provided. Default: cosangle + 'sd:layout': + advanced: true + + column_distance: + type: + - "null" + - type: enum + symbols: + - "cosangle" + - "abscosangle" + - "euclid" + - "abseuclid" + - "cor" + - "abscor" + default: "euclid" + label: "Distance metric for HOPACH column clustering" + doc: | + Distance metric for HOPACH column clustering. Ignored if --cluster is not + provided. Default: euclid + 'sd:layout': + advanced: true + + center_row: + type: boolean? + default: false + label: "Apply mean centering for feature expression prior to running clustering by row" + doc: | + Apply mean centering for feature expression prior to running + clustering by row. Ignored when --cluster is not row or both. + Default: do not centered + 'sd:layout': + advanced: true + + maximum_padj: + type: float? + default: 0.05 + label: "Maximum P-adjusted to show features in the exploratory visualization analysis" + doc: | + In the exploratory visualization analysis output only features with + adjusted P-value not bigger than this value. Default: 0.05 + 'sd:layout': + advanced: true + sample_names_cond_1: type: - "null" @@ -217,6 +293,18 @@ outputs: doc: "DESeq generated file with phenotypes in CLS format. 
Compatible with GSEA" outputSource: deseq/phenotypes_file + mds_plot_html: + type: File? + outputSource: deseq/mds_plot_html + label: "MDS plot of normalized counts" + doc: | + MDS plot of normalized counts + HTML format + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + plot_lfc_vs_mean: type: File? label: "Plot of normalised mean versus log2 fold change" @@ -283,6 +371,42 @@ outputs: homoskedastic (have constant variance along the range of mean values) outputSource: deseq/plot_pca_pdf + volcano_plot_html_file: + type: File + outputSource: make_volcano_plot/html_file + label: "Volcano Plot" + doc: | + HTML index file with volcano plot data. + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + volcano_plot_css_file: + type: File + outputSource: make_volcano_plot/css_file + label: "Volcano Plot CSS" + doc: | + CSS index file with volcano plot data. + + volcano_plot_js_file: + type: File + outputSource: make_volcano_plot/js_file + label: "Volcano Plot JS" + doc: | + JS index file with volcano plot data. 
+ + heatmap_html: + type: File + outputSource: morpheus_heatmap/heatmap_html + label: "Heatmap of normalized counts" + doc: | + Morpheus heatmap in HTML format + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + deseq_stdout_log: type: File format: "http://edamontology.org/format_2330" @@ -297,6 +421,18 @@ outputs: doc: "DESeq stderr log" outputSource: deseq/stderr_log + morpheus_stdout_log: + type: File + outputSource: morpheus_heatmap/stdout_log + label: "Morpheus heatmap stdout log" + doc: "Morpheus heatmap stdout log" + + morpheus_stderr_log: + type: File + outputSource: morpheus_heatmap/stderr_log + label: "Morpheus heatmap stderr log" + doc: "Morpheus heatmap stderr log" + steps: @@ -333,6 +469,13 @@ steps: treated_sample_names: sample_names_cond_2 rpkm_cutoff: rpkm_cutoff batch_file: batch_file + cluster_method: + source: cluster_method + valueFrom: $(self=="none"?null:self) + row_distance: row_distance + column_distance: column_distance + center_row: center_row + maximum_padj: maximum_padj threads: threads out: - diff_expr_file @@ -344,9 +487,28 @@ steps: - plot_pca_pdf - read_counts_file - phenotypes_file + - mds_plot_html - stdout_log - stderr_log + make_volcano_plot: + run: ../tools/volcanot-plot.cwl + in: + diff_expr_file: deseq/diff_expr_file # we need this input only to guarantee workflow steps execution order + out: + - html_file + - css_file + - js_file + + morpheus_heatmap: + run: ../tools/morpheus-heatmap.cwl + in: + read_counts_gct: deseq/read_counts_file + out: + - heatmap_html + - stdout_log + - stderr_log + $namespaces: s: http://schema.org/ From 711ae73c0639c827171f0526abfd62d669ee4bff Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 30 Nov 2022 16:41:45 -0500 Subject: [PATCH 004/162] Update VOlcano Plot to the latest docker image --- tools/volcanot-plot.cwl | 37 ++++++++++++++++++------ workflows/deseq-multi-factor.cwl | 48 ++++++++++++++++++++++++++++++++ workflows/deseq.cwl | 18 +++++++++++- 3 files changed, 
94 insertions(+), 9 deletions(-) diff --git a/tools/volcanot-plot.cwl b/tools/volcanot-plot.cwl index c446f3e3..fdbb1085 100644 --- a/tools/volcanot-plot.cwl +++ b/tools/volcanot-plot.cwl @@ -4,17 +4,38 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/visualization:v0.0.3 + dockerPull: biowardrobe2/visualization:v0.0.4 inputs: diff_expr_file: - type: File? + type: File + inputBinding: + position: 5 + doc: | + TSV file holding data for the plot + + x_axis_column: + type: string + inputBinding: + position: 6 + doc: | + Name of column in file for the plots x-axis (ex: "log2FoldChange") + + y_axis_column: + type: string + inputBinding: + position: 7 + doc: | + Name of column in file for the plots y-axis (ex: "padj") + + label_column: + type: string + inputBinding: + position: 8 doc: | - This input is not being used for now, but we need it so we can - guarantee the order of workflow steps execution when we use this - tool in a workflow. + Name of column in file for each data points 'name' (ex: "GeneId") outputs: @@ -22,21 +43,21 @@ outputs: css_file: type: File outputBinding: - glob: "./html_data/index.css" + glob: "./volcano_plot/volcano_plot/html_data/index.css" doc: | CSS file for Volcano Plot js_file: type: File outputBinding: - glob: "./html_data/index.js" + glob: "./volcano_plot/volcano_plot/html_data/index.js" doc: | Javascript file for Volcano Plot html_file: type: File outputBinding: - glob: "./html_data/index.html" + glob: "./volcano_plot/volcano_plot/html_data/index.html" doc: | HTML index file for Volcano Plot diff --git a/workflows/deseq-multi-factor.cwl b/workflows/deseq-multi-factor.cwl index d9bf46e8..1613820e 100644 --- a/workflows/deseq-multi-factor.cwl +++ b/workflows/deseq-multi-factor.cwl @@ -322,6 +322,31 @@ outputs: tab: 'Plots' Caption: 'PCA plot of normalized counts based on the top 500 features with the highest row variance' + volcano_plot_html_file: + type: File + outputSource: make_volcano_plot/html_file 
+ label: "Volcano Plot" + doc: | + HTML index file with volcano plot data. + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + volcano_plot_css_file: + type: File + outputSource: make_volcano_plot/css_file + label: "Volcano Plot CSS" + doc: | + CSS index file with volcano plot data. + + volcano_plot_js_file: + type: File + outputSource: make_volcano_plot/js_file + label: "Volcano Plot JS" + doc: | + JS index file with volcano plot data. + heatmap_html: type: File outputSource: morpheus_heatmap/heatmap_html @@ -413,6 +438,29 @@ steps: - stdout_log - stderr_log + make_volcano_plot: + run: ../tools/volcanot-plot.cwl + in: + diff_expr_file: deseq_multi_factor/diff_expr_features + x_axis_column: + default: "log2FoldChange" + y_axis_column: + default: "padj" + label_column: + source: feature_type + valueFrom: | + ${ + if (self == "transcript") { + return "feature"; + } else { + return "GeneId"; + } + } + out: + - html_file + - css_file + - js_file + morpheus_heatmap: run: ../tools/morpheus-heatmap.cwl in: diff --git a/workflows/deseq.cwl b/workflows/deseq.cwl index 4af2e643..842bc65c 100644 --- a/workflows/deseq.cwl +++ b/workflows/deseq.cwl @@ -494,7 +494,23 @@ steps: make_volcano_plot: run: ../tools/volcanot-plot.cwl in: - diff_expr_file: deseq/diff_expr_file # we need this input only to guarantee workflow steps execution order + diff_expr_file: deseq/diff_expr_file + x_axis_column: + default: "log2FoldChange" + y_axis_column: + default: "padj" + label_column: + source: group_by + valueFrom: | + ${ + if (self == "isoforms") { + return "RefseqId"; + } else if (self == "genes") { + return "GeneId"; + } else { + return "GeneId"; + } + } out: - html_file - css_file From d5f6ceff859b7c44704fd4e31fe01412a535749a Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 13 Dec 2022 10:17:36 -0500 Subject: [PATCH 005/162] Update volcano plot to the latest --- tools/volcanot-plot.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tools/volcanot-plot.cwl b/tools/volcanot-plot.cwl index fdbb1085..831dccea 100644 --- a/tools/volcanot-plot.cwl +++ b/tools/volcanot-plot.cwl @@ -4,7 +4,7 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/visualization:v0.0.4 + dockerPull: biowardrobe2/visualization:v0.0.5 inputs: From 72dcb1a1e592794cbef3f2f74bb19d0a644f6978 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 21 Dec 2022 13:46:05 -0500 Subject: [PATCH 006/162] Update GSEAPy worfkflow to support the latest GCT format --- tools/gseapy.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/gseapy.cwl b/tools/gseapy.cwl index e14e361b..744aad83 100644 --- a/tools/gseapy.cwl +++ b/tools/gseapy.cwl @@ -8,7 +8,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/gseapy:v0.0.5 + dockerPull: biowardrobe2/gseapy:v0.0.6 inputs: From 9429fe8dacc6e7b401dd70e40b19ed6fc6789535 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 10 Jan 2023 12:55:43 -0500 Subject: [PATCH 007/162] Fix bug in the Morpheus row annotation types --- tools/morpheus-heatmap.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/morpheus-heatmap.cwl b/tools/morpheus-heatmap.cwl index a2e8f8ff..e71428b9 100644 --- a/tools/morpheus-heatmap.cwl +++ b/tools/morpheus-heatmap.cwl @@ -8,7 +8,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/morpheus:v0.0.1 + dockerPull: biowardrobe2/morpheus:v0.0.2 inputs: From 2d04993e44d08f009a37226713d57ce388571911 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 27 Jan 2023 14:45:40 -0500 Subject: [PATCH 008/162] Update old DESeq pipeline to export baseMean column required for MA-plot --- tools/deseq-advanced.cwl | 2 +- tools/deseq-lrt.cwl | 2 +- tools/group-isoforms.cwl | 2 +- workflows/filter-deseq-for-heatmap.cwl | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/deseq-advanced.cwl b/tools/deseq-advanced.cwl index 
91815ae0..70ea512e 100644 --- a/tools/deseq-advanced.cwl +++ b/tools/deseq-advanced.cwl @@ -8,7 +8,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/scidap-deseq:v0.0.26 + dockerPull: biowardrobe2/scidap-deseq:v0.0.27 inputs: diff --git a/tools/deseq-lrt.cwl b/tools/deseq-lrt.cwl index 801c9e7b..f282c3a9 100644 --- a/tools/deseq-lrt.cwl +++ b/tools/deseq-lrt.cwl @@ -8,7 +8,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/scidap-deseq:v0.0.26 + dockerPull: biowardrobe2/scidap-deseq:v0.0.27 inputs: diff --git a/tools/group-isoforms.cwl b/tools/group-isoforms.cwl index a9dd3ac4..e89c7bad 100644 --- a/tools/group-isoforms.cwl +++ b/tools/group-isoforms.cwl @@ -8,7 +8,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/scidap-deseq:v0.0.26 + dockerPull: biowardrobe2/scidap-deseq:v0.0.27 inputs: diff --git a/workflows/filter-deseq-for-heatmap.cwl b/workflows/filter-deseq-for-heatmap.cwl index f2928fcd..f105dd9b 100644 --- a/workflows/filter-deseq-for-heatmap.cwl +++ b/workflows/filter-deseq-for-heatmap.cwl @@ -36,8 +36,8 @@ inputs: doc: "Filtering parameters (WHERE parameters for SQL query)" 'sd:filtering': params: - columns: ["RefseqId", "GeneId", "Chrom", "TxStart", "TxEnd", "Strand", "RpkmCondition1", "RpkmCondition2", "log2FoldChange", "pvalue", "padj"] - types: ["string", "string", "string", "number", "number", "string", "number", "number", "number", "number", "number"] + columns: ["RefseqId", "GeneId", "Chrom", "TxStart", "TxEnd", "Strand", "RpkmCondition1", "RpkmCondition2", "baseMean", "log2FoldChange", "pvalue", "padj"] + types: ["string", "string", "string", "number", "number", "string", "number", "number", "number", "number", "number", "number"] header: type: boolean? 
From da191fc4cefe5e617f27b9470cece7794788d7ad Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 6 Feb 2023 17:18:50 -0500 Subject: [PATCH 009/162] Add MA-plot to DESeq pipelines --- tools/ma-plot.cwl | 109 ++++++++++++++++++ tools/{volcanot-plot.cwl => volcano-plot.cwl} | 17 +-- workflows/deseq-multi-factor.cwl | 60 +++++++--- workflows/deseq.cwl | 62 +++++++--- 4 files changed, 208 insertions(+), 40 deletions(-) create mode 100644 tools/ma-plot.cwl rename tools/{volcanot-plot.cwl => volcano-plot.cwl} (87%) diff --git a/tools/ma-plot.cwl b/tools/ma-plot.cwl new file mode 100644 index 00000000..792736f2 --- /dev/null +++ b/tools/ma-plot.cwl @@ -0,0 +1,109 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/visualization:v0.0.6 + + +inputs: + + diff_expr_file: + type: File + inputBinding: + position: 5 + doc: | + TSV file holding data for the plot + + x_axis_column: + type: string + inputBinding: + position: 6 + doc: | + Name of column in file for the plots x-axis (ex: "baseMean") + + y_axis_column: + type: string + inputBinding: + position: 7 + doc: | + Name of column in file for the plots y-axis (ex: "log2FoldChange") + + label_column: + type: string + inputBinding: + position: 8 + doc: | + Name of column in file for each data points 'name' (ex: "GeneId") + + +outputs: + + html_data: + type: Directory + outputBinding: + glob: "./volcano_plot/MD-MA_plot" + doc: | + Directory html data for MA-plot + + html_file: + type: File + outputBinding: + glob: "./volcano_plot/MD-MA_plot/html_data/index.html" + doc: | + HTML index file for MA-plot + + +baseCommand: ["ma_plot.sh"] + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "MA-plot" +s:name: "MA-plot" +s:alternateName: "Builds ma-plot from the DESeq output" + +s:downloadUrl: 
https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/ma-plot.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + +doc: | + MA-plot + + Builds ma-plot from the DESeq output diff --git a/tools/volcanot-plot.cwl b/tools/volcano-plot.cwl similarity index 87% rename from tools/volcanot-plot.cwl rename to tools/volcano-plot.cwl index 831dccea..133fc4e3 100644 --- a/tools/volcanot-plot.cwl +++ b/tools/volcano-plot.cwl @@ -4,7 +4,7 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/visualization:v0.0.5 + dockerPull: biowardrobe2/visualization:v0.0.6 inputs: @@ -40,19 +40,12 @@ inputs: outputs: - css_file: - type: File - outputBinding: - glob: "./volcano_plot/volcano_plot/html_data/index.css" - doc: | - CSS file for Volcano Plot - - js_file: - type: File + html_data: + type: Directory outputBinding: - glob: "./volcano_plot/volcano_plot/html_data/index.js" + glob: "./volcano_plot/volcano_plot" doc: | - Javascript file for Volcano Plot + Directory html data for Volcano Plot html_file: type: File diff --git 
a/workflows/deseq-multi-factor.cwl b/workflows/deseq-multi-factor.cwl index 1613820e..7b342ff6 100644 --- a/workflows/deseq-multi-factor.cwl +++ b/workflows/deseq-multi-factor.cwl @@ -327,25 +327,36 @@ outputs: outputSource: make_volcano_plot/html_file label: "Volcano Plot" doc: | - HTML index file with volcano plot data. + HTML index file for Volcano Plot 'sd:visualPlugins': - linkList: tab: 'Overview' target: "_blank" - volcano_plot_css_file: - type: File - outputSource: make_volcano_plot/css_file - label: "Volcano Plot CSS" + volcano_plot_html_data: + type: Directory + outputSource: make_volcano_plot/html_data + label: "Directory html data for Volcano Plot" doc: | - CSS index file with volcano plot data. + Directory html data for Volcano Plot - volcano_plot_js_file: + ma_plot_html_file: type: File - outputSource: make_volcano_plot/js_file - label: "Volcano Plot JS" + outputSource: make_ma_plot/html_file + label: "MA-plot" + doc: | + HTML index file for MA-plot + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + ma_plot_html_data: + type: Directory + outputSource: make_ma_plot/html_data + label: "Directory html data for MA-plot" doc: | - JS index file with volcano plot data.
+ Directory html data for MA-plot heatmap_html: type: File @@ -439,7 +450,7 @@ steps: - stderr_log make_volcano_plot: - run: ../tools/volcanot-plot.cwl + run: ../tools/volcano-plot.cwl in: diff_expr_file: deseq_multi_factor/diff_expr_features x_axis_column: @@ -457,9 +468,30 @@ steps: } } out: - - html_file - - css_file - - js_file + - html_data + - html_file + + make_ma_plot: + run: ../tools/ma-plot.cwl + in: + diff_expr_file: deseq_multi_factor/diff_expr_features + x_axis_column: + default: "baseMean" + y_axis_column: + default: "log2FoldChange" + label_column: + source: feature_type + valueFrom: | + ${ + if (self == "transcript") { + return "feature"; + } else { + return "GeneId"; + } + } + out: + - html_data + - html_file morpheus_heatmap: run: ../tools/morpheus-heatmap.cwl diff --git a/workflows/deseq.cwl b/workflows/deseq.cwl index 842bc65c..994f4204 100644 --- a/workflows/deseq.cwl +++ b/workflows/deseq.cwl @@ -376,25 +376,36 @@ outputs: outputSource: make_volcano_plot/html_file label: "Volcano Plot" doc: | - HTML index file with volcano plot data. + HTML index file for Volcano Plot 'sd:visualPlugins': - linkList: tab: 'Overview' target: "_blank" - volcano_plot_css_file: - type: File - outputSource: make_volcano_plot/css_file - label: "Volcano Plot CSS" + volcano_plot_html_data: + type: Directory + outputSource: make_volcano_plot/html_data + label: "Directory html data for Volcano Plot" doc: | - CSS index file with volcano plot data. + Directory html data for Volcano Plot - volcano_plot_js_file: + ma_plot_html_file: type: File - outputSource: make_volcano_plot/js_file - label: "Volcano Plot JS" + outputSource: make_ma_plot/html_file + label: "MA-plot" + doc: | + HTML index file for MA-plot + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + ma_plot_html_data: + type: Directory + outputSource: make_ma_plot/html_data + label: "Directory html data for MA-plot" doc: | - JS index file with volcano plot data.
+ Directory html data for MA-plot heatmap_html: type: File @@ -492,7 +503,7 @@ steps: - stderr_log make_volcano_plot: - run: ../tools/volcanot-plot.cwl + run: ../tools/volcano-plot.cwl in: diff_expr_file: deseq/diff_expr_file x_axis_column: @@ -512,9 +523,32 @@ steps: } } out: - - html_file - - css_file - - js_file + - html_data + - html_file + + make_ma_plot: + run: ../tools/ma-plot.cwl + in: + diff_expr_file: deseq/diff_expr_file + x_axis_column: + default: "baseMean" + y_axis_column: + default: "log2FoldChange" + label_column: + source: group_by + valueFrom: | + ${ + if (self == "isoforms") { + return "RefseqId"; + } else if (self == "genes") { + return "GeneId"; + } else { + return "GeneId"; + } + } + out: + - html_data + - html_file morpheus_heatmap: run: ../tools/morpheus-heatmap.cwl From 5e39de699e57b770f99fcefee0ca6b9def363572 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 6 Mar 2023 11:24:46 -0500 Subject: [PATCH 010/162] Update Volcano and MA plots to use the latest dockerfile --- tools/ma-plot.cwl | 2 +- tools/volcano-plot.cwl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ma-plot.cwl b/tools/ma-plot.cwl index 792736f2..da5bbf74 100644 --- a/tools/ma-plot.cwl +++ b/tools/ma-plot.cwl @@ -4,7 +4,7 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/visualization:v0.0.6 + dockerPull: biowardrobe2/visualization:v0.0.7 inputs: diff --git a/tools/volcano-plot.cwl b/tools/volcano-plot.cwl index 133fc4e3..c834dc22 100644 --- a/tools/volcano-plot.cwl +++ b/tools/volcano-plot.cwl @@ -4,7 +4,7 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/visualization:v0.0.6 + dockerPull: biowardrobe2/visualization:v0.0.7 inputs: From 06c12872d18f12bcb7139111c3323e78d45e8404 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 6 Mar 2023 12:01:51 -0500 Subject: [PATCH 011/162] Fix bug in the selecting a proper column for Volcano/MA plot --- 
workflows/deseq-multi-factor.cwl | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/workflows/deseq-multi-factor.cwl b/workflows/deseq-multi-factor.cwl index 7b342ff6..fe3f4cd0 100644 --- a/workflows/deseq-multi-factor.cwl +++ b/workflows/deseq-multi-factor.cwl @@ -458,15 +458,7 @@ steps: y_axis_column: default: "padj" label_column: - source: feature_type - valueFrom: | - ${ - if (self == "transcript") { - return "feature"; - } else { - return "GeneId"; - } - } + default: "feature" out: - html_data - html_file @@ -480,15 +472,7 @@ steps: y_axis_column: default: "log2FoldChange" label_column: - source: feature_type - valueFrom: | - ${ - if (self == "transcript") { - return "feature"; - } else { - return "GeneId"; - } - } + default: "feature" out: - html_data - html_file From fbfa6b8aff1a0b44f8d64f92f4a9bf5f2e1c0c42 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 10 Mar 2023 14:25:56 -0500 Subject: [PATCH 012/162] Add DiffBind Multi-factor Analysis --- tools/diffbind-multi-factor.cwl | 636 ++++++++++++++++++ tools/diffbind.cwl | 2 +- workflows/diffbind-multi-factor.cwl | 975 ++++++++++++++++++++++++++++ 3 files changed, 1612 insertions(+), 1 deletion(-) create mode 100644 tools/diffbind-multi-factor.cwl create mode 100644 workflows/diffbind-multi-factor.cwl diff --git a/tools/diffbind-multi-factor.cwl b/tools/diffbind-multi-factor.cwl new file mode 100644 index 00000000..b1342aaa --- /dev/null +++ b/tools/diffbind-multi-factor.cwl @@ -0,0 +1,636 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: DockerRequirement + dockerPull: biowardrobe2/diffbind:v0.0.15 +- class: InlineJavascriptRequirement +- class: InitialWorkDirRequirement + listing: | + ${ + var listing = []; + for (var i = 0; i < inputs.alignment_files.length; i++){ + var alignment_file = inputs.alignment_files[i]; + var prefix = "u" + i + "_"; + alignment_file.basename = prefix + alignment_file.basename; + if (alignment_file.secondaryFiles 
&& alignment_file.secondaryFiles.length > 0){ + for (var j = 0; j < alignment_file.secondaryFiles.length; j++){ + var secondary_file = alignment_file.secondaryFiles[j]; + secondary_file.basename = prefix + secondary_file.basename; + listing.push(secondary_file); + } + delete alignment_file.secondaryFiles; + } + listing.push(alignment_file); + } + return listing; + } + + +inputs: + + alignment_files: + type: File[] + secondaryFiles: + - .bai + inputBinding: + prefix: "--alignments" + doc: + Sorted and indexed alignment files in bam format + + peak_files: + type: File[] + inputBinding: + prefix: "--peaks" + doc: + Peak files in the MACS2 xls format. Number and order of the + files should correspond to the files provided in --alignments + parameter. + + dataset_names: + type: string[] + inputBinding: + prefix: "--aliases" + doc: | + Unique names for datasets provided in --alignments and --peaks + parameters, no special characters or spaces are allowed. Number + and order of the names should correspond to the values provided + in --alignments and --peaks parameters. + + metadata_file: + type: File + inputBinding: + prefix: "--metadata" + doc: | + TSV/CSV metadata file to describe datasets provided in --alignments + and --peaks parameters. First column should have the name 'sample', + all other columns names should be selected from the following list: + Tissue, Factor, Condition, Treatment, Caller, Replicate. The values + from the 'sample' column should correspond to the values provided in + --aliases parameter. For a proper --contrast interpretation, values + defined in each metadata column should not be used in any of the other + columns. All metadata columns are treated as factors (no covariates + are supported).
+ + scoreby: + type: + - "null" + - type: enum + symbols: + - "pvalue" + - "qvalue" + inputBinding: + prefix: "--scoreby" + doc: | + Score metrics to build peak overlap correlation heatmap and exclude low + quality peaks based on the threshold provided in --score parameter. + Default: pvalue + + score_threshold: + type: float? + inputBinding: + prefix: "--score" + doc: | + Filtering threshold to keep only those peaks where the metric selected + in --scoreby parameter is less than or equal to the provided value. + Default: 0.05 + + rpkm_threshold: + type: float? + inputBinding: + prefix: "--minrpkm" + doc: | + Filtering threshold to keep only those peaks where the max RPKM for + all datasets is bigger than or equal to the provided value. + Default: 1 + + overlap_threshold: + type: float? + inputBinding: + prefix: "--minoverlap" + doc: | + Filtering threshold to keep only those peaks that are present in at + least this many datasets when generating consensus set of peaks used + in differential analysis. If this threshold has a value between zero + and one, only those peaks will be included that are present in at least + this proportion of datasets. When combined with --groupby parameter, + --minoverlap threshold is applied per group, then union of the resulted + peaks are used in the differential analysis. Default: 2 + + groupby: + type: + - "null" + - string + - string[] + inputBinding: + prefix: "--groupby" + doc: | + Column(s) from the metadata table to define datasets grouping. --minoverlap + filtering threshold will be applied within each group independently. Union + of the resulted peaks from each of the groups will be used in the differential + analysis. Default: apply --minoverlap filtering threshold for all datasets + jointly + + design_formula: + type: string + inputBinding: + prefix: "--design" + doc: | + Design formula comprised of the metadata columns names. + It should start with ~ + + contrast: + type: string? 
+ inputBinding: + prefix: "--contrast" + doc: | + Contrast applied to the analysis results when calculating log2 fold changes. + It should be formatted as a mathematical formula of values present in the + metadata table. It is a required parameter if --method is set to edger. If not + provided and --method is set to deseq2, the last term from the design formula + will be used. + + base_levels: + type: + - "null" + - string + - string[] + inputBinding: + prefix: "--base" + doc: | + Base levels for each of the metadata columns. Number and order of the provided + values should correspond to the metadata columns. Default: define base levels + alphabetically. + + analysis_method: + type: + - "null" + - type: enum + symbols: + - "deseq2" + - "edger" + inputBinding: + prefix: "--method" + doc: | + Method used in the differential binding analysis. Should be equal to + either edger or deseq2. + Default: deseq2 + + normalization_method: + type: + - "null" + - type: enum + symbols: + - "auto" + - "rle" + - "tmm" + - "lib" + inputBinding: + prefix: "--norm" + doc: | + Normalization technique applied to the read counts before running differential + binding analysis. When set to auto selects rle for deseq2 and tmm for edger. + Default: auto + + padj_threshold: + type: float? + inputBinding: + prefix: "--padj" + doc: | + Filtering threshold to report only differentially bound sites with adjusted + P-value less than or equal to the provided value. + Default: 0.05 + + cluster_method: + type: + - "null" + - type: enum + symbols: + - "row" + - "column" + - "both" + inputBinding: + prefix: "--cluster" + doc: | + Hopach clustering method to be run on normalized read counts. + Default: do not run clustering + + row_distance: + type: + - "null" + - type: enum + symbols: + - "cosangle" + - "abscosangle" + - "euclid" + - "abseuclid" + - "cor" + - "abscor" + inputBinding: + prefix: "--rowdist" + doc: | + Distance metric for HOPACH row clustering. Ignored if --cluster is not + provided. 
+ Default: cosangle + + column_distance: + type: + - "null" + - type: enum + symbols: + - "cosangle" + - "abscosangle" + - "euclid" + - "abseuclid" + - "cor" + - "abscor" + inputBinding: + prefix: "--columndist" + doc: | + Distance metric for HOPACH column clustering. Ignored if --cluster is not + provided. + Default: euclid + + center_row: + type: boolean? + inputBinding: + prefix: "--center" + doc: | + Apply mean centering for normalized read counts prior to running + clustering by row. Ignored when --cluster is not row or both. + Default: do not center + + export_pdf_plots: + type: boolean? + inputBinding: + prefix: "--pdf" + doc: | + Export plots in PDF. + Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix for generated files + Default: ./diffbind + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. + Default: 1 + + +outputs: + + pk_vrlp_s_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_pk_vrlp_s*.png" + doc: | + Peakset overlap rate + PNG format + + pk_vrlp_s_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_pk_vrlp_s*.pdf" + doc: | + Peakset overlap rate + PDF format + + pk_scr_corr_plot_png: + type: File? + outputBinding: + glob: "*_pk_scr_corr.png" + doc: | + Datasets correlation (peak score) + PNG format + + pk_scr_corr_plot_pdf: + type: File? + outputBinding: + glob: "*_pk_scr_corr.pdf" + doc: | + Datasets correlation (peak score) + PDF format + + rw_rds_corr_plot_png: + type: File? + outputBinding: + glob: "*_rw_rds_corr.png" + doc: | + Datasets correlation (raw reads) + PNG format + + rw_rds_corr_plot_pdf: + type: File? + outputBinding: + glob: "*_rw_rds_corr.pdf" + doc: | + Datasets correlation (raw reads) + PDF format + + nr_rds_corr_plot_png: + type: File?
+ outputBinding: + glob: "*_nr_rds_corr.png" + doc: | + Datasets correlation (normalized reads) + PNG format + + nr_rds_corr_plot_pdf: + type: File? + outputBinding: + glob: "*_nr_rds_corr.pdf" + doc: | + Datasets correlation (normalized reads) + PDF format + + diff_vlcn_plot_png: + type: File? + outputBinding: + glob: "*_diff_vlcn.png" + doc: | + Volcano plot for differentially bound sites + PNG format + + diff_vlcn_plot_pdf: + type: File? + outputBinding: + glob: "*_diff_vlcn.pdf" + doc: | + Volcano plot for differentially bound sites + PDF format + + diff_ma_plot_png: + type: File? + outputBinding: + glob: "*_diff_ma.png" + doc: | + MA-plot for differentially bound sites + PNG format + + diff_ma_plot_pdf: + type: File? + outputBinding: + glob: "*_diff_ma.pdf" + doc: | + MA-plot for differentially bound sites + PDF format + + nr_rds_pca_1_2_plot_png: + type: File? + outputBinding: + glob: "*_nr_rds_pca_1_2.png" + doc: | + PCA (1,2) of not filtered normalized counts + PNG format + + nr_rds_pca_1_2_plot_pdf: + type: File? + outputBinding: + glob: "*_nr_rds_pca_1_2.pdf" + doc: | + PCA (1,2) of not filtered normalized counts + PDF format + + nr_rds_pca_2_3_plot_png: + type: File? + outputBinding: + glob: "*_nr_rds_pca_2_3.png" + doc: | + PCA (2,3) of not filtered normalized counts + PNG format + + nr_rds_pca_2_3_plot_pdf: + type: File? + outputBinding: + glob: "*_nr_rds_pca_2_3.pdf" + doc: | + PCA (2,3) of not filtered normalized counts + PDF format + + nr_rds_mds_html: + type: File? + outputBinding: + glob: "*_nr_rds_mds.html" + doc: | + MDS plot of normalized counts. + HTML format + + pk_prfl_plot_png: + type: File? + outputBinding: + glob: "*_pk_prfl.png" + doc: | + Peak profiles + PNG format + + pk_prfl_plot_pdf: + type: File? + outputBinding: + glob: "*_pk_prfl.pdf" + doc: | + Peak profiles + PDF format + + diff_sts_tsv: + type: File? + outputBinding: + glob: "*_diff_sts.tsv" + doc: | + Differentially bound sites. Not filtered. 
+ TSV format + + nr_rds_gct: + type: File? + outputBinding: + glob: "*_nr_rds.gct" + doc: | + Normalized filtered by padj read counts + GCT format + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["run_diffbind_manual.R"] +stdout: diffbind_manual_stdout.log +stderr: diffbind_manual_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "DiffBind Multi-factor Analysis" +s:name: "DiffBind Multi-factor Analysis" +s:alternateName: "Runs DiffBind multi-factor analysis with manual control over major parameters" + + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/diffbind-multi-factor.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + DiffBind Multi-factor Analysis + + Runs DiffBind multi-factor analysis with manual control over major parameters + + +s:about: | + usage: run_diffbind_manual.R + [-h] --alignments ALIGNMENTS [ALIGNMENTS ...] --peaks PEAKS [PEAKS ...] 
+ --aliases ALIASES [ALIASES ...] --metadata METADATA + [--scoreby {pvalue,qvalue}] [--score SCORE] [--minrpkm MINRPKM] + [--minoverlap MINOVERLAP] [--groupby [GROUPBY ...]] --design DESIGN + [--contrast CONTRAST] [--base [BASE ...]] [--method {edger,deseq2}] + [--norm {auto,rle,tmm,lib}] [--padj PADJ] [--cluster {row,column,both}] + [--rowdist {cosangle,abscosangle,euclid,abseuclid,cor,abscor}] + [--columndist {cosangle,abscosangle,euclid,abseuclid,cor,abscor}] + [--center] [--pdf] [--output OUTPUT] [--cpus CPUS] + + DiffBind Multi-factor Analysis + + options: + -h, --help show this help message and exit + --alignments ALIGNMENTS [ALIGNMENTS ...] + Sorted and indexed alignment files in bam format. + --peaks PEAKS [PEAKS ...] + Peak files in the MACS2 xls format. Number and order + of the files should correspond to the files provided + in --alignments parameter. + --aliases ALIASES [ALIASES ...] + Unique names for datasets provided in --alignments and + --peaks parameters, no special characters or spaces + are allowed. Number and order of the names should + correspond to the values provided in --alignments and + --peaks parameters. + --metadata METADATA TSV/CSV metadata file to describe datasets provided in + --alignments and --peaks parameters. First column + should have the name 'sample', all other columns names + should be selected from the following list: Tissue, + Factor, Condition, Treatment, Caller, Replicate. The + values from the 'sample' column should correspond to + the values provided in --aliases parameter. For a + proper --contrast intepretation, values defined in + each metadata column should not be used in any of the + other columns. All metadata columns are treated as + factors (no covariates are supported). + --scoreby {pvalue,qvalue} + Score metrics to build peak overlap correlation + heatmap and exclude low quality peaks based on the + threshold provided in --score parameter. 
Default: + pvalue + --score SCORE Filtering threshold to keep only those peaks where the + metric selected in --scoreby parameter is less than or + equal to the provided value. Default: 0.05 + --minrpkm MINRPKM Filtering threshold to keep only those peaks where the + max RPKM for all datasets is bigger than or equal to + the provided value. Default: 1 + --minoverlap MINOVERLAP + Filtering threshold to keep only those peaks that are + present in at least this many datasets when generating + consensus set of peaks used in differential analysis. + If this threshold has a value between zero and one, + only those peaks will be included that are present in + at least this proportion of datasets. When combined + with --groupby parameter, --minoverlap threshold is + applied per group, then union of the resulted peaks + are used in the differential analysis. Default: 2 + --groupby [GROUPBY ...] + Column(s) from the metadata table to define datasets + grouping. --minoverlap filtering threshold will be + applied within each group independently. Union of the + resulted peaks from each of the groups will be used in + the differential analysis. Default: apply --minoverlap + filtering threshold for all datasets jointly + --design DESIGN Design formula comprised of the metadata columns + names. It should start with ~. + --contrast CONTRAST Contrast applied to the analysis results when + calculating log2 fold changes. It should be formatted + as a mathematical formula of values present in the + metadata table. It is a required parameter if --method + is set to edger. If not provided and --method is set + to deseq2, the last term from the design formula will + be used. + --base [BASE ...] Base levels for each of the metadata columns. Number + and order of the provided values should correspond to + the metadata columns. Default: define base levels + alphabetically. + --method {edger,deseq2} + Method used in the differential binding analysis. 
+ Should be equal to either edger or deseq2. Default: + deseq2 + --norm {auto,rle,tmm,lib} + Normalization technique applied to the read counts + before running differential binding analysis. When set + to auto selects rle for deseq2 and tmm for edger. + Default: auto + --padj PADJ Filtering threshold to report only differentially + bound sites with adjusted P-value less than or equal + to the provided value. Default: 0.05 + --cluster {row,column,both} + Hopach clustering method to be run on normalized read + counts. Default: do not run clustering + --rowdist {cosangle,abscosangle,euclid,abseuclid,cor,abscor} + Distance metric for HOPACH row clustering. Ignored if + --cluster is not provided. Default: cosangle + --columndist {cosangle,abscosangle,euclid,abseuclid,cor,abscor} + Distance metric for HOPACH column clustering. Ignored + if --cluster is not provided. Default: euclid + --center Apply mean centering for normalized read counts prior + to running clustering by row. Ignored when --cluster + is not row or both. Default: do not centered + --pdf Export plots in PDF. Default: false + --output OUTPUT Output prefix for generated files + --cpus CPUS Number of cores/cpus to use. 
Default: 1 \ No newline at end of file diff --git a/tools/diffbind.cwl b/tools/diffbind.cwl index fa167978..d94b305c 100644 --- a/tools/diffbind.cwl +++ b/tools/diffbind.cwl @@ -4,7 +4,7 @@ class: CommandLineTool requirements: - class: DockerRequirement - dockerPull: biowardrobe2/diffbind:v0.0.13 + dockerPull: biowardrobe2/diffbind:v0.0.15 inputs: diff --git a/workflows/diffbind-multi-factor.cwl b/workflows/diffbind-multi-factor.cwl new file mode 100644 index 00000000..50b67f8f --- /dev/null +++ b/workflows/diffbind-multi-factor.cwl @@ -0,0 +1,975 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var split_by_common_delim = function(line) { + function get_unique(value, index, self) { + return self.indexOf(value) === index && value != ""; + } + var splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + + +'sd:upstream': + dna_experiment: + - "chipseq-se.cwl" + - "chipseq-pe.cwl" + - "trim-chipseq-se.cwl" + - "trim-chipseq-pe.cwl" + - "trim-atacseq-se.cwl" + - "trim-atacseq-pe.cwl" + genome_indices: + - "genome-indices.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name" + sd:preview: + position: 1 + + alignment_files: + type: File[] + secondaryFiles: + - .bai + label: "ChIP-Seq/ATAC-Seq experiments" + doc: | + Sorted and indexed alignment files + in BAM format + 'sd:upstreamSource': "dna_experiment/bambai_pair" + 'sd:localLabel': true + + peak_files: + type: File[] + label: "ChIP-Seq/ATAC-Seq experiments" + doc: + Peak files in the MACS2 xls format + 'sd:upstreamSource': "dna_experiment/macs2_called_peaks" + 'sd:localLabel': true + + dataset_names: + type: string[] + label: "ChIP-Seq/ATAC-Seq experiments" + doc: | + Unique names for datasets + 'sd:upstreamSource': "dna_experiment/alias" + 'sd:localLabel': 
true + + genome_coverage_files: + type: File[] + label: "ChIP-Seq/ATAC-Seq experiments" + doc: | + Genome coverage files in bigWig format + 'sd:upstreamSource': "dna_experiment/bigwig" + 'sd:localLabel': true + + narrow_peak_files: + type: + - "null" + - File[] + label: "ChIP-Seq/ATAC-Seq experiments" + doc: | + Called peaks files in narrowPeak format + 'sd:upstreamSource': "dna_experiment/macs2_narrow_peaks" + 'sd:localLabel': true + + broad_peak_files: + type: + - "null" + - File[] + label: "ChIP-Seq/ATAC-Seq experiments" + doc: | + Called peaks files in broadPeak format + 'sd:upstreamSource': "dna_experiment/macs2_broad_peaks" + 'sd:localLabel': true + + annotation_file: + type: File + label: "Reference genome" + doc: | + Genome annotation file in TSV format + 'sd:upstreamSource': "genome_indices/annotation" + 'sd:localLabel': true + + chrom_length_file: + type: File + label: "Reference genome" + doc: | + Chromosome length file in txt format + 'sd:upstreamSource': "genome_indices/chrom_length" + 'sd:localLabel': true + + scoreby: + type: + - "null" + - type: enum + symbols: + - "pvalue" + - "qvalue" + default: "pvalue" + label: "Score metrics to exclude low quality peaks" + doc: | + Score metrics to build peak overlap correlation + heatmap and exclude low quality peaks based on + the specific threshold value + + score_threshold: + type: float? + default: 0.05 + label: "Maximum allowed peak score (pvalue/qvalue)" + doc: | + Filtering threshold to keep only those peaks + where the selected metric is less than or equal + to the provided value + + metadata_file: + type: File + label: "Metadata file to describe datasets categories" + doc: | + Metadata file in TSV/CSV format to describe + input datasets categories. First column should + have the name 'sample', all other columns names + should be selected from the following list: + Tissue, Factor, Condition, Treatment, Caller, + Replicate. 
The values from the 'sample' column + should correspond to the names of the selected + ChIP-Seq/ATAC-Seq experiments. Values defined in + each metadata column should not be used in any of + the other columns. All metadata columns are treated + as factors (no covariates are supported). + + overlap_threshold: + type: float? + default: 2 + label: "Minimum peakset overlap threshold" + doc: | + Filtering threshold to keep only those peaks + that are present in at least this many datasets + when generating consensus set of peaks used in + differential analysis. If this threshold has a + value between zero and one, only those peaks + will be included that are present in at least + this proportion of datasets. If input datasets + are grouped by the certain metadata columns, + minimum peakset overlap threshold will be first + applied per group, then union of the resulted + peaks will be used in the differential analysis. + + groupby: + type: string? + default: null + label: "Metadata column(s) that should be used for datasets grouping" + doc: | + Column(s) from the metadata table to define + datasets grouping. Minimum peakset overlap + threshold will be applied within each group + independently. Union of the resulted peaks + from each of the groups will be used in the + differential analysis. If not provided, + minimum peakset overlap filtering threshold + will be applied for all datasets jointly. + + rpkm_threshold: + type: float? + default: 1 + label: "Minimum allowed RPKM for consensus peaks" + doc: | + Filtering threshold to keep only those consensus + peaks where the maximum RPKM for all datasets is + bigger than or equal to the provided value. + + design_formula: + type: string + label: "Design formula" + doc: | + Design formula comprised of the metadata + columns names. It should start with ~ + + base_levels: + type: string? + default: null + label: "Base levels (optional)" + doc: | + Base levels for each of the metadata columns. 
+ Number and order of the provided values should + correspond to the metadata columns. If not + provided, the default base levels will be + defined alphabetically. + + contrast: + type: string? + default: null + label: "Contrast for calculating log2 fold changes" + doc: | + Contrast applied to the analysis results when + calculating log2 fold changes. It should be + formatted as a mathematical formula of values + present in the metadata table. If not provided, + the last term from the design formula + will be used. + + padj_threshold: + type: float? + default: 0.05 + label: "Maximum allowed adjusted P-value for differentially bound sites" + doc: | + Filtering threshold to report only differentially + bound sites with adjusted P-value less than or + equal to the provided value. + + promoter_dist: + type: int? + default: 1000 + label: "Promoter distance, bp" + doc: | + Maximum distance from gene TSS (in both + directions) overlapping which the peak will + be assigned to the promoter region. + 'sd:layout': + advanced: true + + upstream_dist: + type: int? + default: 20000 + label: "Upstream distance, bp" + doc: | + Maximum distance from the promoter (only in + upstream direction) overlapping which the peak + will be assigned to the upstream region.
+ 'sd:layout': + advanced: true + + cluster_method: + type: + - "null" + - type: enum + symbols: + - "row" + - "column" + - "both" + - "none" + default: "none" + label: "Clustering method" + doc: | + Hierarchical clustering method to be run + on normalized read counts + 'sd:layout': + advanced: true + + row_distance: + type: + - "null" + - type: enum + symbols: + - "cosangle" + - "abscosangle" + - "euclid" + - "abseuclid" + - "cor" + - "abscor" + default: "cosangle" + label: "Distance metric for row clustering" + doc: | + Distance metric for hierarchical row clustering + 'sd:layout': + advanced: true + + column_distance: + type: + - "null" + - type: enum + symbols: + - "cosangle" + - "abscosangle" + - "euclid" + - "abseuclid" + - "cor" + - "abscor" + default: "euclid" + label: "Distance metric for column clustering" + doc: | + Distance metric for hierarchical + column clustering + 'sd:layout': + advanced: true + + center_row: + type: boolean? + default: false + label: "Apply row mean centering before clustering" + doc: | + Apply mean centering for normalized read counts + prior to running clustering by row. Ignored if + clustering method is not set to row or both. 
+ 'sd:layout': + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + - "2" + - "3" + - "4" + default: "1" + label: "Number of cores/cpus to use" + doc: | + Number of cores/cpus to use + 'sd:layout': + advanced: true + + +outputs: + + gc_files: + type: File[] + label: "Genome coverage" + doc: | + Genome coverage files in bigWig format + outputSource: pipe/gc_files + 'sd:visualPlugins': + - igvbrowser: + tab: 'Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "Genome coverage" + height: 120 + + np_files: + type: + - "null" + - File[] + label: "Called peaks (narrowPeak format)" + doc: | + Called peaks files in narrowPeak format + outputSource: pipe/np_files + 'sd:visualPlugins': + - igvbrowser: + tab: 'Genome Browser' + id: 'igvbrowser' + type: 'annotation' + name: "Called peaks" + displayMode: "COLLAPSE" + height: 40 + + bp_files: + type: + - "null" + - File[] + label: "Called peaks (broadPeak format)" + doc: | + Called peaks files in broadPeak format + outputSource: pipe/bp_files + 'sd:visualPlugins': + - igvbrowser: + tab: 'Genome Browser' + id: 'igvbrowser' + type: 'annotation' + name: "Called peaks" + displayMode: "COLLAPSE" + height: 40 + + diff_sts_bigbed: + type: File + label: "Differentially bound sites (bigBed format)" + doc: | + Differentially bound sites in bigBed format + outputSource: bed_to_bigbed/bigbed_file + 'sd:visualPlugins': + - igvbrowser: + tab: 'Genome Browser' + id: 'igvbrowser' + type: 'annotation' + format: 'bigbed' + name: "Differentially bound sites" + height: 40 + + pk_vrlp_s_plot_png: + type: + - "null" + - type: array + items: File + label: "Peakset overlap rate" + doc: | + Peakset overlap rate + PNG format + outputSource: diffbind/pk_vrlp_s_plot_png + 'sd:visualPlugins': + - image: + tab: 'Exploratory plots' + Caption: 'Peakset overlap rate' + + pk_scr_corr_plot_png: + type: File? 
+ label: "Datasets correlation (peak score)" + doc: | + Datasets correlation (peak score) + PNG format + outputSource: diffbind/pk_scr_corr_plot_png + 'sd:visualPlugins': + - image: + tab: 'Exploratory plots' + Caption: 'Datasets correlation (peak score)' + + rw_rds_corr_plot_png: + type: File? + label: "Datasets correlation (raw reads)" + doc: | + Datasets correlation (raw reads) + PNG format + outputSource: diffbind/rw_rds_corr_plot_png + 'sd:visualPlugins': + - image: + tab: 'Exploratory plots' + Caption: 'Datasets correlation (raw reads)' + + nr_rds_corr_plot_png: + type: File? + label: "Datasets correlation (normalized reads)" + doc: | + Datasets correlation (normalized reads) + PNG format + outputSource: diffbind/nr_rds_corr_plot_png + 'sd:visualPlugins': + - image: + tab: 'Exploratory plots' + Caption: 'Datasets correlation (normalized reads)' + + pk_prfl_plot_png: + type: File? + label: "Peak profiles" + doc: | + Peak profiles + PNG format + outputSource: diffbind/pk_prfl_plot_png + 'sd:visualPlugins': + - image: + tab: 'Differential plots' + Caption: 'Peak profiles' + + diff_vlcn_plot_png: + type: File? + label: "Volcano plot for differentially bound sites" + doc: | + Volcano plot for differentially bound sites + PNG format + outputSource: diffbind/diff_vlcn_plot_png + 'sd:visualPlugins': + - image: + tab: 'Differential plots' + Caption: 'Volcano plot for differentially bound sites' + + diff_ma_plot_png: + type: File? + label: "MA-plot for differentially bound sites" + doc: | + MA-plot for differentially bound sites + PNG format + outputSource: diffbind/diff_ma_plot_png + 'sd:visualPlugins': + - image: + tab: 'Differential plots' + Caption: 'MA-plot for differentially bound sites' + + nr_rds_pca_1_2_plot_png: + type: File? 
+ label: "PCA (1,2) of not filtered normalized counts" + doc: | + PCA (1,2) of not filtered normalized counts + PNG format + outputSource: diffbind/nr_rds_pca_1_2_plot_png + 'sd:visualPlugins': + - image: + tab: 'Exploratory plots' + Caption: 'PCA (1,2) of not filtered normalized counts' + + nr_rds_pca_2_3_plot_png: + type: File? + label: "PCA (2,3) of not filtered normalized counts" + doc: | + PCA (2,3) of not filtered normalized counts + PNG format + outputSource: diffbind/nr_rds_pca_2_3_plot_png + 'sd:visualPlugins': + - image: + tab: 'Exploratory plots' + Caption: 'PCA (2,3) of not filtered normalized counts' + + nr_rds_mds_html: + type: File? + outputSource: diffbind/nr_rds_mds_html + label: "MDS plot of normalized counts" + doc: | + MDS plot of normalized counts. + HTML format + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + diff_sts_tsv: + type: File + label: "Differentially bound sites with assigned nearest genes" + doc: | + Differentially bound sites with assigned nearest genes + TSV format + outputSource: restore_columns/output_file + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'Differentially bound sites' + Title: 'Differentially bound sites' + + diff_sts_labeled_tsv: + type: File + label: "Differentially bound sites with labels" + doc: | + Differentially bound sites with labels + TSV format + outputSource: add_label_column/output_file + + volcano_plot_html_file: + type: File + label: "Volcano Plot" + doc: | + HTML index file for Volcano Plot + outputSource: make_volcano_plot/html_file + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + volcano_plot_html_data: + type: Directory + label: "Directory html data for Volcano Plot" + doc: | + Directory html data for Volcano Plot + outputSource: make_volcano_plot/html_data + + ma_plot_html_file: + type: File + label: "MA-plot" + doc: | + HTML index file for MA-plot + outputSource: make_ma_plot/html_file + 'sd:visualPlugins': + - linkList: + tab: 'Overview' 
+ target: "_blank" + + ma_plot_html_data: + type: Directory + label: "Directory html data for MA-plot" + doc: | + Directory html data for MA-plot + outputSource: make_ma_plot/html_data + + heatmap_html: + type: File + label: "Heatmap of normalized counts" + doc: | + Morpheus heatmap in HTML format + outputSource: morpheus_heatmap/heatmap_html + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + nr_rds_gct: + type: File + label: "GCT file with normalized read counts per peak" + doc: | + GCT file with normalized read counts per peak + outputSource: extend_gct/extended_gct + + diffbind_stdout_log: + type: File + label: "DiffBind stdout log" + doc: | + DiffBind stdout log + outputSource: diffbind/stdout_log + + diffbind_stderr_log: + type: File + label: "DiffBind stderr log" + doc: | + DiffBind stderr log + outputSource: diffbind/stderr_log + + morpheus_stdout_log: + type: File + label: "Morpheus heatmap stdout log" + doc: | + Morpheus heatmap stdout log + outputSource: morpheus_heatmap/stdout_log + + morpheus_stderr_log: + type: File + label: "Morpheus heatmap stderr log" + doc: | + Morpheus heatmap stderr log + outputSource: morpheus_heatmap/stderr_log + + +steps: + + pipe: + run: + cwlVersion: v1.0 + class: ExpressionTool + inputs: + genome_coverage_files: + type: File[] + narrow_peak_files: + type: + - "null" + - File[] + broad_peak_files: + type: + - "null" + - File[] + outputs: + gc_files: + type: File[] + np_files: + type: + - "null" + - File[] + bp_files: + type: + - "null" + - File[] + expression: | + ${ + var results = {}; + var output_names = [ + "gc_files", + "np_files", + "bp_files" + ]; + var sources = [ + inputs.genome_coverage_files, + inputs.narrow_peak_files, + inputs.broad_peak_files + ]; + for (var i = 0; i < sources.length; i++){ + var current_source = sources[i]; + var current_output_name = output_names[i]; + results[current_output_name] = null; + if (current_source != null && current_source.length > 0){ + for (var j =
0; j < current_source.length; j++){ + var new_item = current_source[j]; + new_item["basename"] = "u" + "_" + i + "_" + j+ "_" + new_item.basename; + if (results[current_output_name] == null){ + results[current_output_name] = [new_item]; + } else { + results[current_output_name].push(new_item); + } + } + } + } + return results; + } + in: + genome_coverage_files: genome_coverage_files + narrow_peak_files: narrow_peak_files + broad_peak_files: broad_peak_files + out: + - gc_files + - np_files + - bp_files + + diffbind: + run: ../tools/diffbind-multi-factor.cwl + in: + alignment_files: alignment_files + peak_files: peak_files + dataset_names: dataset_names + metadata_file: metadata_file + scoreby: scoreby + score_threshold: score_threshold + rpkm_threshold: rpkm_threshold + overlap_threshold: overlap_threshold + groupby: + source: groupby + valueFrom: $(split_by_common_delim(self)) + design_formula: design_formula + contrast: + source: contrast + valueFrom: $(self==""?null:self) # safety measure + base_levels: + source: base_levels + valueFrom: $(split_by_common_delim(self)) + analysis_method: + default: "deseq2" # hardcoded to always use DESeq2 because EdgeR fails to run without contrast + normalization_method: + default: "auto" # hardcoded to auto as we don't allow to use EdgeR + padj_threshold: padj_threshold + cluster_method: + source: cluster_method + valueFrom: $(self=="none"?null:self) + row_distance: row_distance + column_distance: column_distance + center_row: center_row + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - pk_vrlp_s_plot_png + - pk_scr_corr_plot_png + - rw_rds_corr_plot_png + - nr_rds_corr_plot_png + - pk_prfl_plot_png + - diff_vlcn_plot_png + - diff_ma_plot_png + - nr_rds_pca_1_2_plot_png + - nr_rds_pca_2_3_plot_png + - nr_rds_mds_html + - diff_sts_tsv + - nr_rds_gct + - stdout_log + - stderr_log + + filter_columns: + run: ../tools/custom-bash.cwl + in: + input_file: diffbind/diff_sts_tsv + script: + default: > + cat $0 |
grep -v "Start" | awk + 'BEGIN {print "chr\tstart\tend\tlength\tabs_summit\tpileup\t-log10(pvalue)\tfold_enrichment\t-log10(qvalue)\tname"} + {print $1"\t"$2"\t"$3"\t"$3-$2+1"\t0\t"NR"\t0\t0\t0\t0"}' > `basename $0` + out: + - output_file + + assign_genes: + run: ../tools/iaintersect.cwl + in: + input_filename: filter_columns/output_file + annotation_filename: annotation_file + promoter_bp: promoter_dist + upstream_bp: upstream_dist + out: + - result_file + + restore_columns: + run: ../tools/custom-bash.cwl + in: + input_file: + - assign_genes/result_file + - diffbind/diff_sts_tsv + script: + default: | + cat $0 | grep -v "start" | sort -k 11n | cut -f 1-5,15 > iaintersect_result.tsv + cat $1 | grep -v "Start" > diffbind_result.tsv + HEADER=`head -n 1 $1`; + echo -e "Refseq_id\tGene_id\ttxStart\ttxEnd\tStrand\tRegion\t${HEADER}" > `basename $0`; + cat iaintersect_result.tsv | paste - diffbind_result.tsv >> `basename $0` + rm iaintersect_result.tsv diffbind_result.tsv + out: + - output_file + + convert_to_bed: + run: ../tools/custom-bash.cwl + in: + input_file: restore_columns/output_file + script: + default: | + cat "$0" | awk -F "\t" 'NR==1 {for (i=1; i<=NF; i++) {ix[$i]=i} } NR>1 {color="255,0,0"; if ($ix["log2FoldChange"]<0) color="0,255,0"; print $ix["Chr"]"\t"$ix["Start"]"\t"$ix["End"]"\tpvalue="$ix["pvalue"]";padj="$ix["padj"]";log2FC="$ix["log2FoldChange"]"\t"1000"\t"$ix["Strand"]"\t"$ix["Start"]"\t"$ix["End"]"\t"color}' > `basename $0` + out: + - output_file + + sort_bed: + run: ../tools/linux-sort.cwl + in: + unsorted_file: convert_to_bed/output_file + key: + default: ["1,1","2,2n"] + out: + - sorted_file + + bed_to_bigbed: + run: ../tools/ucsc-bedtobigbed.cwl + in: + input_bed: sort_bed/sorted_file + bed_type: + default: "bed4+5" + chrom_length_file: chrom_length_file + output_filename: + source: sort_bed/sorted_file + valueFrom: $(self.basename.split('.').slice(0,-1).join('.') + ".bigBed") + out: + - bigbed_file + + add_label_column: + run: 
../tools/custom-bash.cwl + in: + input_file: diffbind/diff_sts_tsv + script: + default: | + HEADER=`head -n 1 $0`; + echo -e "label\t${HEADER}" > diff_sts_labeled.tsv; + cat "$0" | grep -v "Start" | awk -F "\t" '{print $1":"$2"-"$3"\t"$0}' >> diff_sts_labeled.tsv + out: + - output_file + + make_volcano_plot: + run: ../tools/volcano-plot.cwl + in: + diff_expr_file: add_label_column/output_file + x_axis_column: + default: "log2FoldChange" + y_axis_column: + default: "padj" + label_column: + default: "label" + out: + - html_data + - html_file + + make_ma_plot: + run: ../tools/ma-plot.cwl + in: + diff_expr_file: add_label_column/output_file + x_axis_column: + default: "baseMean" + y_axis_column: + default: "log2FoldChange" + label_column: + default: "label" + out: + - html_data + - html_file + + extend_gct: + run: + cwlVersion: v1.0 + class: CommandLineTool + hints: + - class: DockerRequirement + dockerPull: biowardrobe2/morpheus:v0.0.2 + - class: InitialWorkDirRequirement + listing: + - entryname: extend.R + entry: | + options(error=function(){traceback(3); quit(save="no", status=1, runLast=FALSE)}) + suppressMessages(library("cmapR")) + suppressMessages(library("dplyr")) + suppressMessages(library("tibble")) + suppressMessages(library("morpheus")) + suppressMessages(library("argparse")) + args = commandArgs(trailingOnly=TRUE) + gct_data <- read.gct(args[1]) + metadata <- read.table(args[2], sep="\t", header=TRUE, check.names=FALSE, stringsAsFactors=FALSE) %>% + mutate(id=paste(Chr, paste(Start, End, sep="-"), sep=":")) %>% + select(id, Gene_id, Region) + row_metadata <- gct_data$rowAnnotations %>% + rownames_to_column("id") %>% + left_join(metadata, by="id") %>% + mutate_at("id", as.vector) + col_metadata <- gct_data$columnAnnotations %>% + rownames_to_column("id") %>% + mutate_at("id", as.vector) + gct_data <- new( + "GCT", + mat=gct_data$data[row_metadata$id, col_metadata$id], + rdesc=row_metadata, + cdesc=col_metadata + ) + write_gct(ds=gct_data, 
ofile="extended.gct", appenddim=FALSE) + inputs: + input_files: + type: File[] + inputBinding: + position: 5 + outputs: + extended_gct: + type: File + outputBinding: + glob: "extended.gct" + baseCommand: ["Rscript", "extend.R"] + in: + input_files: + - diffbind/nr_rds_gct + - restore_columns/output_file + out: + - extended_gct + + morpheus_heatmap: + run: ../tools/morpheus-heatmap.cwl + in: + read_counts_gct: extend_gct/extended_gct + out: + - heatmap_html + - stdout_log + - stderr_log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "DiffBind Multi-factor Analysis" +s:name: "DiffBind Multi-factor Analysis" +s:alternateName: "Runs DiffBind multi-factor analysis with manual control over major parameters" + + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/diffbind-multi-factor.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: + - class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:michael.kotliar@cchmc.org + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + DiffBind Multi-factor Analysis + 
------------------------------ + + DiffBind processes ChIP-Seq data enriched for genomic loci where specific protein/DNA binding occurs, including peak sets identified by ChIP-Seq peak callers and + aligned sequence read datasets. It is designed to work with multiple peak sets simultaneously, representing different ChIP experiments (antibodies, transcription + factor and/or histone marks, experimental conditions, replicates) as well as managing the results of multiple peak callers. + + For more information please refer to: + ------------------------------------- + Ross-Innes CS, Stark R, Teschendorff AE, Holmes KA, Ali HR, Dunning MJ, Brown GD, Gojis O, Ellis IO, Green AR, Ali S, Chin S, Palmieri C, Caldas C, Carroll JS (2012). + “Differential oestrogen receptor binding is associated with clinical outcome in breast cancer.” Nature, 481, 389-393. \ No newline at end of file From 441fa3d8fbea67e5be498c9a61b1b88c118b93ed Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 14 Mar 2023 13:15:20 -0400 Subject: [PATCH 013/162] Reorder inputs in DiffBind Multi-factor Analysis --- workflows/diffbind-multi-factor.cwl | 155 ++++++++++++++-------------- 1 file changed, 79 insertions(+), 76 deletions(-) diff --git a/workflows/diffbind-multi-factor.cwl b/workflows/diffbind-multi-factor.cwl index 50b67f8f..55ef4055 100644 --- a/workflows/diffbind-multi-factor.cwl +++ b/workflows/diffbind-multi-factor.cwl @@ -59,7 +59,7 @@ inputs: type: string[] label: "ChIP-Seq/ATAC-Seq experiments" doc: | - Unique names for datasets + Unique names for samples 'sd:upstreamSource': "dna_experiment/alias" 'sd:localLabel': true @@ -95,7 +95,7 @@ inputs: type: File label: "Reference genome" doc: | - Genome annotation file in TSV format + Reference genome 'sd:upstreamSource': "genome_indices/annotation" 'sd:localLabel': true @@ -103,39 +103,16 @@ inputs: type: File label: "Reference genome" doc: | - Chromosome length file in txt format + Reference genome 'sd:upstreamSource':
"genome_indices/chrom_length" 'sd:localLabel': true - scoreby: - type: - - "null" - - type: enum - symbols: - - "pvalue" - - "qvalue" - default: "pvalue" - label: "Score metrics to exclude low quality peaks" - doc: | - Score metrics to build peak overlap correlation - heatmap and exclude low quality peaks based on - the specific threshold value - - score_threshold: - type: float? - default: 0.05 - label: "Maximum allowed peak score (pvalue/qvalue)" - doc: | - Filtering threshold to keep only those peaks - where the selected metric is less than or equal - to the provided value - metadata_file: type: File - label: "Metadata file to describe datasets categories" + label: "Diff. analysis. Metadata file to describe samples categories" doc: | Metadata file in TSV/CSV format to describe - input datasets categories. First column should + input samples categories. First column should have the name 'sample', all other columns names should be selected from the following list: Tissue, Factor, Condition, Treatment, Caller, @@ -146,49 +123,26 @@ inputs: the other columns. All metadata columns are treated as factors (no covariates are supported). - overlap_threshold: - type: float? - default: 2 - label: "Minimum peakset overlap threshold" - doc: | - Filtering threshold to keep only those peaks - that are present in at least this many datasets - when generating consensus set of peaks used in - differential analysis. If this threshold has a - value between zero and one, only those peaks - will be included that are present in at least - this proportion of datasets. If input datasets - are grouped by the certain metadata columns, - minimum peakset overlap threshold will be first - applied per group, then union of the resulted - peaks will be used in the differential analysis. - groupby: type: string? default: null - label: "Metadata column(s) that should be used for datasets grouping" + label: "Diff. analysis. 
Metadata column(s) that should be used for samples grouping" doc: | Column(s) from the metadata table to define - datasets grouping. Minimum peakset overlap + samples grouping. Minimum peakset overlap threshold will be applied within each group independently. Union of the resulted peaks from each of the groups will be used in the differential analysis. If not provided, minimum peakset overlap filtering threshold - will be applied for all datasets jointly. - - rpkm_threshold: - type: float? - default: 1 - label: "Minimum allowed RPKM for consensus peaks" - doc: | - Filtering threshold to keep only those consensus - peaks where the maximum RPKM for all datasets is - bigger than or equal to the provided value. + will be applied for all samples jointly. + For grouping by multiple columns provide + space separated values, for example, + 'Treatment Tissue' design_formula: type: string - label: "Design formula" + label: "Diff. analysis. Design formula" doc: | Design formula comprised of the metadata columns names. It should start with ~ @@ -196,7 +150,7 @@ inputs: base_levels: type: string? default: null - label: "Base levels (optional)" + label: "Diff. analysis. Base levels (optional)" doc: | Base levels for each of the metadata columns. Number and order of the provided values should @@ -207,7 +161,7 @@ inputs: contrast: type: string? default: null - label: "Contrast for calculating log2 fold changes" + label: "Diff. analysis. Contrast for calculating log2 fold changes" doc: | Contrast applied to the analysis results when calculating log2 fold changes. It should be @@ -219,16 +173,65 @@ inputs: padj_threshold: type: float? default: 0.05 - label: "Maximum allowed adjusted P-value for differentially bound sites" + label: "Peak selection. Maximum allowed adjusted P-value for differentially bound sites" doc: | Filtering threshold to report only differentially bound sites with adjusted P-value less than or equal to the provided value. 
+ scoreby: + type: + - "null" + - type: enum + symbols: + - "pvalue" + - "qvalue" + default: "pvalue" + label: "Peak selection. Score metrics to exclude low quality peaks" + doc: | + Score metrics to build peak overlap correlation + heatmap and exclude low quality peaks based on + the specific threshold value + + score_threshold: + type: float? + default: 0.05 + label: "Peak selection. Maximum allowed peak score (pvalue/qvalue)" + doc: | + Filtering threshold to keep only those peaks + where the selected metric is less than or equal + to the provided value + + overlap_threshold: + type: float? + default: 2 + label: "Peak selection. Minimum peakset overlap threshold" + doc: | + Filtering threshold to keep only those peaks + that are present in at least this many samples + when generating consensus set of peaks used in + differential analysis. If this threshold has a + value between zero and one, only those peaks + will be included that are present in at least + this proportion of samples. If input samples + are grouped by the certain metadata columns, + minimum peakset overlap threshold will be first + applied per group, then union of the resulted + peaks will be used in the differential analysis. + + rpkm_threshold: + type: float? + default: 1 + label: "Peak selection. Minimum allowed RPKM for consensus peaks" + doc: | + Filtering threshold to keep only those consensus + peaks where the maximum RPKM for all samples is + bigger than or equal to the provided value. + promoter_dist: type: int? default: 1000 - label: "Promoter distance, bp" + label: "Peak annotation. Promoter distance, bp" doc: | Maximum distance from gene TSS (in both direction) overlapping which the peak will @@ -239,7 +242,7 @@ inputs: upstream_dist: type: int? default: 20000 - label: "Upstream distance, bp" + label: "Peak annotation. 
Upstream distance, bp" doc: | Maximum distance from the promoter (only in upstream direction) overlapping which the peak @@ -257,7 +260,7 @@ inputs: - "both" - "none" default: "none" - label: "Clustering method" + label: "Peak clustering. Clustering method" doc: | Hierarchical clustering method to be run on normalized read counts @@ -276,7 +279,7 @@ inputs: - "cor" - "abscor" default: "cosangle" - label: "Distance metric for row clustering" + label: "Peak clustering. Distance metric for row clustering" doc: | Distance metric for hierarchical row clustering 'sd:layout': @@ -294,7 +297,7 @@ inputs: - "cor" - "abscor" default: "euclid" - label: "Distance metric for column clustering" + label: "Peak clustering. Distance metric for column clustering" doc: | Distance metric for hierarchical column clustering @@ -304,7 +307,7 @@ inputs: center_row: type: boolean? default: false - label: "Apply row mean centering before clustering" + label: "Peak clustering. Apply row mean centering before clustering" doc: | Apply mean centering for normalized read counts prior to running clustering by row. Ignored if @@ -411,39 +414,39 @@ outputs: pk_scr_corr_plot_png: type: File? - label: "Datasets correlation (peak score)" + label: "Samples correlation (peak score)" doc: | - Datasets correlation (peak score) + Samples correlation (peak score) PNG format outputSource: diffbind/pk_scr_corr_plot_png 'sd:visualPlugins': - image: tab: 'Exploratory plots' - Caption: 'Datasets correlation (peak score)' + Caption: 'Samples correlation (peak score)' rw_rds_corr_plot_png: type: File? - label: "Datasets correlation (raw reads)" + label: "Samples correlation (raw reads)" doc: | - Datasets correlation (raw reads) + Samples correlation (raw reads) PNG format outputSource: diffbind/rw_rds_corr_plot_png 'sd:visualPlugins': - image: tab: 'Exploratory plots' - Caption: 'Datasets correlation (raw reads)' + Caption: 'Samples correlation (raw reads)' nr_rds_corr_plot_png: type: File? 
- label: "Datasets correlation (normalized reads)" + label: "Samples correlation (normalized reads)" doc: | - Datasets correlation (normalized reads) + Samples correlation (normalized reads) PNG format outputSource: diffbind/nr_rds_corr_plot_png 'sd:visualPlugins': - image: tab: 'Exploratory plots' - Caption: 'Datasets correlation (normalized reads)' + Caption: 'Samples correlation (normalized reads)' pk_prfl_plot_png: type: File? From a345c9ad694a574c42f6bc5bc82fed4e124a1a26 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 14 Mar 2023 14:22:19 -0400 Subject: [PATCH 014/162] Update doc field in the design formula input --- workflows/diffbind-multi-factor.cwl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/workflows/diffbind-multi-factor.cwl b/workflows/diffbind-multi-factor.cwl index 55ef4055..4a412172 100644 --- a/workflows/diffbind-multi-factor.cwl +++ b/workflows/diffbind-multi-factor.cwl @@ -145,7 +145,10 @@ inputs: label: "Diff. analysis. Design formula" doc: | Design formula comprised of the metadata - columns names. It should start with ~ + columns names. For example, to model the + effect of Treatment, Tissue, and their + interaction use + ~Treatment%2BTissue%2BTreatment%3ATissue base_levels: type: string? From 1879e0ee432c7c10cf4469fede9945c238102649 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 29 Mar 2023 15:06:35 -0400 Subject: [PATCH 015/162] Updated wrong label in DiffBind Multi-Factor workflow --- workflows/diffbind-multi-factor.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/diffbind-multi-factor.cwl b/workflows/diffbind-multi-factor.cwl index 4a412172..407dd07d 100644 --- a/workflows/diffbind-multi-factor.cwl +++ b/workflows/diffbind-multi-factor.cwl @@ -176,7 +176,7 @@ inputs: padj_threshold: type: float? default: 0.05 - label: "Peak selection. Maximum allowed adjusted P-value for differentially bound sites" + label: "Diff. analysis. 
Maximum allowed adjusted P-value for differentially bound sites" doc: | Filtering threshold to report only differentially bound sites with adjusted P-value less than or From c65f6fcc0bd9781e3677a1bdecace626adb3dea0 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 29 Mar 2023 16:14:18 -0400 Subject: [PATCH 016/162] Add updated set of single-cell pipelines --- tools/cellbrowser-build-cellranger-arc.cwl | 15 +- tools/cellbrowser-build-cellranger.cwl | 16 +- tools/cellranger-aggr.cwl | 262 ++- tools/cellranger-arc-aggr.cwl | 35 +- tools/cellranger-arc-count.cwl | 32 +- tools/cellranger-arc-mkref.cwl | 24 +- tools/cellranger-count.cwl | 210 +- tools/cellranger-mkref.cwl | 21 +- tools/cellranger-mkvdjref.cwl | 176 ++ tools/cellranger-multi.cwl | 564 ++++++ tools/cellranger-reanalyze.cwl | 166 +- tools/collect-stats-sc-arc-count.cwl | 116 ++ tools/collect-stats-sc-count.cwl | 116 ++ tools/extract-7z.cwl | 127 ++ tools/sc-atac-cluster.cwl | 644 ++++++ tools/sc-atac-reduce.cwl | 599 ++++++ tools/sc-ctype-assign.cwl | 1005 ++++++++++ tools/sc-multiome-filter.cwl | 1735 +++++++++++++++++ tools/sc-rna-cluster.cwl | 785 ++++++++ tools/sc-rna-da-cells.cwl | 602 ++++++ tools/sc-rna-de-pseudobulk.cwl | 743 +++++++ tools/sc-rna-filter.cwl | 864 ++++++++ tools/sc-rna-reduce.cwl | 862 ++++++++ tools/sc-triangulate.cwl | 442 +++++ tools/sc-wnn-cluster.cwl | 993 ++++++++++ tools/tar-compress.cwl | 12 +- tools/tar-extract.cwl | 84 + workflows/cellranger-aggr.cwl | 179 +- workflows/cellranger-arc-aggr.cwl | 11 +- workflows/cellranger-arc-count.cwl | 100 +- workflows/cellranger-mkref.cwl | 2 +- workflows/cellranger-mkvdjref.cwl | 144 ++ workflows/cellranger-multi.cwl | 676 +++++++ workflows/cellranger-reanalyze.cwl | 79 +- workflows/sc-atac-cluster.cwl | 519 +++++ workflows/sc-atac-reduce.cwl | 466 +++++ workflows/sc-ctype-assign.cwl | 823 ++++++++ workflows/sc-multiome-filter.cwl | 1405 +++++++++++++ workflows/sc-rna-cluster.cwl | 614 ++++++ workflows/sc-rna-da-cells.cwl | 473 
+++++ workflows/sc-rna-de-pseudobulk.cwl | 745 +++++++ workflows/sc-rna-filter.cwl | 712 +++++++ workflows/sc-rna-reduce.cwl | 674 +++++++ workflows/sc-triangulate.cwl | 403 ++++ workflows/sc-wnn-cluster.cwl | 770 ++++++++ .../single-cell-preprocess-cellranger.cwl | 147 +- 46 files changed, 19676 insertions(+), 516 deletions(-) create mode 100644 tools/cellranger-mkvdjref.cwl create mode 100644 tools/cellranger-multi.cwl create mode 100644 tools/collect-stats-sc-arc-count.cwl create mode 100644 tools/collect-stats-sc-count.cwl create mode 100644 tools/extract-7z.cwl create mode 100644 tools/sc-atac-cluster.cwl create mode 100644 tools/sc-atac-reduce.cwl create mode 100644 tools/sc-ctype-assign.cwl create mode 100644 tools/sc-multiome-filter.cwl create mode 100644 tools/sc-rna-cluster.cwl create mode 100644 tools/sc-rna-da-cells.cwl create mode 100644 tools/sc-rna-de-pseudobulk.cwl create mode 100644 tools/sc-rna-filter.cwl create mode 100644 tools/sc-rna-reduce.cwl create mode 100644 tools/sc-triangulate.cwl create mode 100644 tools/sc-wnn-cluster.cwl create mode 100644 tools/tar-extract.cwl create mode 100644 workflows/cellranger-mkvdjref.cwl create mode 100644 workflows/cellranger-multi.cwl create mode 100644 workflows/sc-atac-cluster.cwl create mode 100644 workflows/sc-atac-reduce.cwl create mode 100644 workflows/sc-ctype-assign.cwl create mode 100644 workflows/sc-multiome-filter.cwl create mode 100644 workflows/sc-rna-cluster.cwl create mode 100644 workflows/sc-rna-da-cells.cwl create mode 100644 workflows/sc-rna-de-pseudobulk.cwl create mode 100644 workflows/sc-rna-filter.cwl create mode 100644 workflows/sc-rna-reduce.cwl create mode 100644 workflows/sc-triangulate.cwl create mode 100644 workflows/sc-wnn-cluster.cwl diff --git a/tools/cellbrowser-build-cellranger-arc.cwl b/tools/cellbrowser-build-cellranger-arc.cwl index e66d847c..90632cb2 100644 --- a/tools/cellbrowser-build-cellranger-arc.cwl +++ b/tools/cellbrowser-build-cellranger-arc.cwl @@ -227,7 +227,12 
@@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -s:name: "cellbrowser-build-cellranger-arc" +label: "Cell Ranger ARC Count/Aggregate to UCSC Cell Browser" +s:name: "Cell Ranger ARC Count/Aggregate to UCSC Cell Browser" +s:alternateName: | + Exports clustering results from Cell Ranger ARC Count Chromatin Accessibility and Gene Expression or + Cell Ranger ARC Aggregate experiments into compatible with UCSC Cell Browser format + s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellbrowser-build-cellranger-arc.cwl s:codeRepository: https://github.com/Barski-lab/workflows s:license: http://www.apache.org/licenses/LICENSE-2.0 @@ -264,7 +269,13 @@ s:creator: doc: | - Converts Cellranger ARC outputs into the data structure supported by UCSC CellBrowser + Cell Ranger ARC Count/Aggregate to UCSC Cell Browser + ===================================================== + + Exports clustering results from Cell Ranger ARC Count + Chromatin Accessibility and Gene Expression or Cell + Ranger ARC Aggregate experiments into compatible with + UCSC Cell Browser format s:about: | diff --git a/tools/cellbrowser-build-cellranger.cwl b/tools/cellbrowser-build-cellranger.cwl index b9bf3708..01a414bd 100644 --- a/tools/cellbrowser-build-cellranger.cwl +++ b/tools/cellbrowser-build-cellranger.cwl @@ -57,6 +57,8 @@ inputs: mkdir -p ./cellbrowser_input/analysis ./cellbrowser_input/filtered_feature_bc_matrix cp -r $0/* ./cellbrowser_input/analysis/ cp -r $1/* ./cellbrowser_input/filtered_feature_bc_matrix/ + echo "Removing gene_expression_ part from all of the folder names in analysis" + du -a ./cellbrowser_input/analysis | cut -f 2 | grep gene_expression | xargs -I{} bash -c 'mv "$1" "${1//gene_expression_/}"' -- {} echo "Run cbImportCellranger" cbImportCellranger -i cellbrowser_input -o cellbrowser_output --name cellbrowser cd ./cellbrowser_output @@ -139,7 +141,12 @@ $namespaces: 
$schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -s:name: "cellbrowser-build-cellranger" +label: "Cell Ranger Count/Aggregate to UCSC Cell Browser" +s:name: "Cell Ranger Count/Aggregate to UCSC Cell Browser" +s:alternateName: | + Exports clustering results from Cell Ranger Count Gene Expression or Cell Ranger + Aggregate experiments into a format compatible with UCSC Cell Browser + s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellbrowser-build-cellranger.cwl s:codeRepository: https://github.com/Barski-lab/workflows s:license: http://www.apache.org/licenses/LICENSE-2.0 @@ -176,7 +183,12 @@ s:creator: doc: | - Converts Cellranger outputs into the data structure supported by UCSC CellBrowser + Cell Ranger Count/Aggregate to UCSC Cell Browser + ================================================================= + + Exports clustering results from Cell Ranger Count Gene Expression + or Cell Ranger Aggregate experiments into a format compatible with + UCSC Cell Browser. 
s:about: | diff --git a/tools/cellranger-aggr.cwl b/tools/cellranger-aggr.cwl index 6015183a..e665ec31 100644 --- a/tools/cellranger-aggr.cwl +++ b/tools/cellranger-aggr.cwl @@ -5,52 +5,88 @@ class: CommandLineTool requirements: - class: InlineJavascriptRequirement expressionLib: - - var get_label = function(i) { - var rootname = inputs.molecule_info_h5[i].basename.split('.').slice(0,-1).join('.'); - rootname = (rootname=="")?inputs.molecule_info_h5[i].basename:rootname; - return inputs.gem_well_labels?inputs.gem_well_labels[i].replace(/,/g, "_"):rootname; + - var get_label = function(input_array, i) { + var rootname = input_array[i].basename.split('.').slice(0,-1).join('.'); + rootname = (rootname=="")?input_array[i].basename:rootname; + return inputs.gem_well_labels?inputs.gem_well_labels[i].replace(/\t|\s|\[|\]|\>|\<|,|\./g, "_"):rootname; }; - class: InitialWorkDirRequirement listing: | ${ - var entry = "library_id,molecule_h5\n" - for (var i=0; i < inputs.molecule_info_h5.length; i++){ - entry += get_label(i) + "," + inputs.molecule_info_h5[i].path + "\n" + if (inputs.molecule_info_h5 != null){ + var entry = "sample_id,molecule_h5\n" + for (var i=0; i < inputs.molecule_info_h5.length; i++){ + entry += get_label(inputs.molecule_info_h5, i) + "," + inputs.molecule_info_h5[i].path + "\n" + } + } else if (inputs.filtered_data_folder != null){ + var entry = "sample_id,sample_outs,donor,origin\n" + for (var i=0; i < inputs.filtered_data_folder.length; i++){ + var donor = "donor" + var origin = "origin" + if (inputs.clonotype_grouping == "same_donor_different_origins"){ + origin = "origin_" + i + } else if (inputs.clonotype_grouping == "different_donors"){ + donor = "donor_" + i + origin = "origin_" + i + } + entry += get_label(inputs.filtered_data_folder, i) + "," + inputs.filtered_data_folder[i].path + "," + donor + "," + origin + "\n" + } + } else { + var entry = "neither molecule_info_h5 nor filtered_data_folder was provided" } return [{ "entry": entry, - 
"entryname": "metadata.csv" + "entryname": "metadata.csv", + "writable": true }]; } hints: - class: DockerRequirement - dockerPull: cumulusprod/cellranger:4.0.0 + dockerPull: cumulusprod/cellranger:7.0.0 inputs: molecule_info_h5: - type: File[] + type: + - "null" + - File[] doc: | Array of molecule-level information files in HDF5 format. - Outputs from "cellranger count" command - + Outputs from "cellranger count" command. Either + molecule_info_h5 or filtered_data_folder should be + provided. If both inputs are provided - use molecule_info_h5. + + filtered_data_folder: + type: + - "null" + - Directory[] + doc: | + Array of folders containing filtered data, i.e., only + cell-associated barcodes. Outputs from "cellranger multi" + command. Either molecule_info_h5 or filtered_data_folder should + be provided. If both inputs are provided - use molecule_info_h5. + gem_well_labels: type: - "null" - string[] doc: | Array of GEM well identifiers to be used for labeling purposes only. - If not provided use rootnames of files from the molecule_info_h5 input + If not provided use rootnames of files from the molecule_info_h5 or + directories from filtered_data_folder inputs. If labels are not + unique, cellranger will fails. normalization_mode: type: - "null" - type: enum name: "normalization" - symbols: ["none", "mapped"] + symbols: + - "none" + - "mapped" inputBinding: position: 5 prefix: "--normalize" @@ -58,6 +94,35 @@ inputs: Library depth normalization mode: mapped, none. Default: mapped + clonotype_grouping: + type: + - "null" + - type: enum + name: "clonotype_grouping" + symbols: + - "same_donor_different_origins" + - "same_donor_and_origin" + - "different_donors" + default: "different_donors" + doc: | + When cellranger aggr is called with cellranger multi outputs, there are three + ways it can process the datasets depending on the combination of donor and + origin values: + 1. 
If two datasets come from the same donor but have different origins, Cell Ranger + will rerun the clonotype grouping algorithm on the combined set of cells. This + allows cells from different datasets to belong to the same clonotype. + 2. If two datasets come from the same donor and origin, then Cell Ranger performs + additional filtering to remove certain rare artifacts. For example, Cell Ranger + will filter expanded exact subclonotypes that are present in one library but not + in another from the same origin, which would be highly improbable, assuming random + draws of cells from the tube. These are believed to arise when a plasma or + plasmablast cell breaks up during or after pipetting from the tube, and the resulting + fragments contaminate GEMs, yielding expanded false clonotypes that are residues of + real single plasma cells. + 3. If two cells came from different donors, then Cell Ranger will not put them in the + same clonotype. + Ignored if cellranger aggr is run with molecule_info_h5 inputs. + threads: type: int? 
inputBinding: @@ -98,59 +163,84 @@ outputs: metrics_summary_report_json: type: File outputBinding: - glob: "aggregated/outs/summary.json" + glob: "aggregated/outs/count/summary.json" doc: | - Aggregated run summary metrics in JSON format + Aggregated GEX run summary metrics in JSON format secondary_analysis_report_folder: type: Directory outputBinding: - glob: "aggregated/outs/analysis" + glob: "aggregated/outs/count/analysis" doc: | - Folder with secondary analysis results including dimensionality reduction, - cell clustering, and differential expression for aggregated results + Folder with secondary analysis of GEX data including dimensionality reduction, + cell clustering, and differential expression filtered_feature_bc_matrix_folder: type: Directory outputBinding: - glob: "aggregated/outs/filtered_feature_bc_matrix" + glob: "aggregated/outs/count/filtered_feature_bc_matrix" doc: | - Folder with aggregated filtered feature-barcode matrices containing only cellular barcodes in MEX format + Folder with aggregated filtered feature-barcode matrices + containing only cellular barcodes in MEX format filtered_feature_bc_matrix_h5: type: File outputBinding: - glob: "aggregated/outs/filtered_feature_bc_matrix.h5" + glob: "aggregated/outs/count/filtered_feature_bc_matrix.h5" doc: | - Aggregated filtered feature-barcode matrices containing only cellular barcodes in HDF5 format + Filtered feature-barcode matrices containing only cellular + barcodes in HDF5 format. 
- raw_feature_bc_matrices_folder: - type: Directory + aggregation_metadata: + type: File outputBinding: - glob: "aggregated/outs/raw_feature_bc_matrix" + glob: "aggregated/outs/aggregation.csv" doc: | - Folder with aggregated unfiltered feature-barcode matrices containing all barcodes in MEX format + Copy of the input aggregation CSV file - raw_feature_bc_matrices_h5: + loupe_browser_track: type: File outputBinding: - glob: "aggregated/outs/raw_feature_bc_matrix.h5" + glob: "aggregated/outs/count/cloupe.cloupe" doc: | - Aggregated unfiltered feature-barcode matrices containing all barcodes in HDF5 format + Loupe Browser visualization and analysis file - aggregation_metadata: - type: File + clonotypes_csv: + type: File? outputBinding: - glob: "aggregated/outs/aggregation.csv" + glob: "aggregated/outs/vdj_*/clonotypes.csv" doc: | - Copy of the input aggregation CSV file + CSV file with high-level descriptions of each clonotype - loupe_browser_track: - type: File + consensus_sequences_fasta: + type: File? + outputBinding: + glob: "aggregated/outs/vdj_*/consensus.fasta" + doc: | + The consensus sequence of each assembled contig. + + consensus_annotations_csv: + type: File? outputBinding: - glob: "aggregated/outs/cloupe.cloupe" + glob: "aggregated/outs/vdj_*/consensus_annotations.csv" doc: | - Loupe Browser visualization and analysis file for aggregated results + CSV file with high-level and detailed annotations of each clonotype + consensus sequence. + + filtered_contig_annotations_csv: + type: File? + outputBinding: + glob: "aggregated/outs/vdj_*/filtered_contig_annotations.csv" + doc: | + CSV file with high-level annotations of each high-confidence contig from + cell-associated barcodes + + loupe_vdj_browser_track: + type: File? 
+ outputBinding: + glob: "aggregated/outs/vdj_*/vloupe.vloupe" + doc: | + Loupe V(D)J Browser visualization and analysis file stdout_log: type: stdout @@ -172,9 +262,11 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cellranger aggr - aggregates data from multiple Cellranger runs" -s:name: "Cellranger aggr - aggregates data from multiple Cellranger runs" -s:alternateName: "Cellranger aggr takes a list of cellranger count output files and produces a single feature-barcode matrix containing all the data" +label: "Cell Ranger Aggregate" +s:name: "Cell Ranger Aggregate" +s:alternateName: | + Aggregates outputs from multiple runs of Cell Ranger Count Gene Expression or + Cell Ranger Multi Gene Expression and V(D)J Repertoire Profiling experiments s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-aggr.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -212,31 +304,19 @@ s:creator: doc: | - Tool calls "cellranger aggr" command to combine output files from "cellranger count" - (the molecule_info.h5 file from each run) into a single feature-barcode matrix containing - all the data. When combining multiple GEM wells, the barcode sequences for each channel - are distinguished by a GEM well suffix appended to the barcode sequence. Each GEM well is - a physically distinct set of GEM partitions, but draws barcode sequences randomly from the - pool of valid barcodes, known as the barcode whitelist. To keep the barcodes unique when - aggregating multiple libraries, we append a small integer identifying the GEM well to the - barcode nucleotide sequence, and use that nucleotide sequence plus ID as the unique identifier - in the feature-barcode matrix. For example, AGACCATTGAGACTTA-1 and AGACCATTGAGACTTA-2 are - distinct cell barcodes from different GEM wells, despite having the same barcode nucleotide - sequence. 
This number, which tells us which GEM well this barcode sequence came from, is - called the GEM well suffix. The numbering of the GEM wells will reflect the order that the - GEM wells were provided in the "molecule_info_h5" and "gem_well_labels" inputs. - - When combining data from multiple GEM wells, the "cellranger aggr" pipeline automatically - equalizes the average read depth per cell between groups before merging. This approach avoids - artifacts that may be introduced due to differences in sequencing depth. It is possible to turn - off normalization or change the way normalization is done through the "normalization_mode" - input. The "none" value may be appropriate if you want to maximize sensitivity and plan to deal - with depth normalization in a downstream step. + Cell Ranger Aggregate + + Aggregates outputs from multiple runs of Cell Ranger Count Gene + Expression (if molecule_info_h5 input provided) or Cell Ranger + Multi Gene Expression and V(D)J Repertoire Profiling experiments + (if filtered_data_folder input provided). If both inputs are + provided - use molecule_info_h5. If neither of them was provided + cellranger aggr will fail. Parameters set by default: --disable-ui - no need in any UI when running in Docker container - --id - hardcoded to `aggregated` as we want to return the content of the - outputs folder as separate outputs + --id - hardcoded to `aggregated` as we want to return the content + of the outputs folder as separate outputs Skipped parameters: --nosecondary @@ -251,38 +331,38 @@ doc: | --overrides --uiport - Not supported features: - - Batch correction caused by different versions of the Single Cell Gene Expression chemistry is - not supported as the generated metadata file doesn't include "batch" field. 
+ Not supported features when aggregating GEX experiments: + - Batch correction caused by different versions of the Single Cell Gene + Expression chemistry is not supported as the generated metadata file + for merging molecule_info_h5 inputs doesn't include "batch" field. + s:about: | Aggregate data from multiple Cell Ranger runs USAGE: - cellranger aggr [FLAGS] [OPTIONS] --id --csv - - FLAGS: - --nosecondary Disable secondary analysis, e.g. clustering - --dry Do not execute the pipeline. Generate a pipeline invocation (.mro) file and stop - --disable-ui Do not serve the UI - --noexit Keep web UI running after pipestance completes or fails - --nopreflight Skip preflight checks - -h, --help Prints help information + cellranger aggr [OPTIONS] --id --csv OPTIONS: - --id A unique run id and output folder name [a-zA-Z0-9_-]+ - --description Sample description to embed in output files [default: ] - --csv Path of CSV file enumerating 'cellranger count' outputs - --normalize Library depth normalization mode [default: mapped] [possible values: mapped, none] - --jobmode Job manager to use. Valid options: local (default), sge, lsf, slurm or a .template file. Search for help on "Cluster Mode" at - support.10xgenomics.com for more details on configuring the pipeline to use a compute cluster [default: local] - --localcores Set max cores the pipeline may request at one time. Only applies to local jobs - --localmem Set max GB the pipeline may request at one time. Only applies to local jobs - --localvmem Set max virtual address space in GB for the pipeline. Only applies to local jobs - --mempercore Reserve enough threads for each job to ensure enough memory will be available, assuming each core on your cluster has at least this much memory - available. Only applies in cluster jobmodes - --maxjobs Set max jobs submitted to cluster at one time. Only applies in cluster jobmodes - --jobinterval Set delay between submitting jobs to cluster, in ms. 
Only applies in cluster jobmodes - --overrides The path to a JSON file that specifies stage-level overrides for cores and memory. Finer-grained than --localcores, --mempercore and --localmem. - Consult the 10x support website for an example override file - --uiport Serve web UI at http://localhost:PORT \ No newline at end of file + --id A unique run id and output folder name [a-zA-Z0-9_-]+ + --description Sample description to embed in output files [default: ] + --csv Path of CSV file enumerating 'cellranger count/vdj/multi' outputs + --normalize Library depth normalization mode [default: mapped] [possible values: mapped, none] + --nosecondary Disable secondary analysis, e.g. clustering + --dry Do not execute the pipeline. Generate a pipeline invocation (.mro) file and stop + --jobmode Job manager to use. Valid options: local (default), sge, lsf, slurm or path to a .template file. Search for help on "Cluster Mode" at + support.10xgenomics.com for more details on configuring the pipeline to use a compute cluster [default: local] + --localcores Set max cores the pipeline may request at one time. Only applies to local jobs + --localmem Set max GB the pipeline may request at one time. Only applies to local jobs + --localvmem Set max virtual address space in GB for the pipeline. Only applies to local jobs + --mempercore Reserve enough threads for each job to ensure enough memory will be available, assuming each core on your cluster has at least this much memory + available. Only applies to cluster jobmodes + --maxjobs Set max jobs submitted to cluster at one time. Only applies to cluster jobmodes + --jobinterval Set delay between submitting jobs to cluster, in ms. Only applies to cluster jobmodes + --overrides The path to a JSON file that specifies stage-level overrides for cores and memory. Finer-grained than --localcores, --mempercore and --localmem. 
+ Consult https://support.10xgenomics.com/ for an example override file + --uiport Serve web UI at http://localhost:PORT + --disable-ui Do not serve the web UI + --noexit Keep web UI running after pipestance completes or fails + --nopreflight Skip preflight checks + -h, --help Print help information \ No newline at end of file diff --git a/tools/cellranger-arc-aggr.cwl b/tools/cellranger-arc-aggr.cwl index 6cd7eafa..f0aa87d8 100644 --- a/tools/cellranger-arc-aggr.cwl +++ b/tools/cellranger-arc-aggr.cwl @@ -15,7 +15,7 @@ requirements: ${ var entry = "library_id,atac_fragments,per_barcode_metrics,gex_molecule_info\n" for (var i=0; i < inputs.gex_molecule_info_h5.length; i++){ - entry += get_label(i) + "," + inputs.atac_fragments_file[i].path + "," + inputs.barcode_metrics_report[i].path + "," + inputs.gex_molecule_info_h5[i].path + "\n" + entry += get_label(i) + "," + inputs.atac_fragments_file_from_count[i].path + "," + inputs.barcode_metrics_report[i].path + "," + inputs.gex_molecule_info_h5[i].path + "\n" } return [{ "entry": entry, @@ -26,12 +26,12 @@ requirements: hints: - class: DockerRequirement - dockerPull: cumulusprod/cellranger-arc:2.0.0 + dockerPull: cumulusprod/cellranger-arc:2.0.2 inputs: - atac_fragments_file: + atac_fragments_file_from_count: type: File[] secondaryFiles: - .tbi @@ -229,9 +229,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cellranger ARC aggr - aggregates data from multiple Cellranger ARC runs" -s:name: "Cellranger ARC aggr - aggregates data from multiple Cellranger ARC runs" -s:alternateName: "Cellranger ARC aggr takes a list of cellranger ARC count output files and produces a single feature-barcode matrix containing all the data" +label: "Cellranger ARC Aggregate" +s:name: "Cellranger ARC Aggregate" +s:alternateName: "Aggregates outputs from multiple runs of Cell Ranger ARC Count Chromatin Accessibility and Gene Expression" s:downloadUrl: 
https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-arc-aggr.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -269,16 +269,11 @@ s:creator: doc: | + Cellranger ARC Aggregate + ======================================================================== - Tool calls "cellranger-arc aggr" command that takes as input a CSV file specifying a list - of cellranger-arc count output files for each GEM well being aggregated and produces a - single feature-barcode matrix containing all the data. When combining multiple GEM wells, - the barcode sequences for each channel are distinguished by a GEM well suffix appended to - the barcode sequence. By default, the reads from each GEM well are subsampled such that all - GEM wells have the same effective sequencing depth for both ATAC and gene expression modalities; - for the ATAC data it is measured in terms of median unique fragments per cell and for gene - expression it is measured in terms of the average number of reads that are confidently mapped - to the transcriptome per cell. However, it is possible to turn off this normalization altogether. + Aggregates outputs from multiple runs of Cell Ranger ARC Count Chromatin + Accessibility and Gene Expression. Parameters set by default: --disable-ui - no need in any UI when running in Docker container @@ -299,6 +294,16 @@ doc: | --overrides --uiport + Tool calls "cellranger-arc aggr" command that takes as input a CSV file specifying a list + of cellranger-arc count output files for each GEM well being aggregated and produces a + single feature-barcode matrix containing all the data. When combining multiple GEM wells, + the barcode sequences for each channel are distinguished by a GEM well suffix appended to + the barcode sequence. 
By default, the reads from each GEM well are subsampled such that all + GEM wells have the same effective sequencing depth for both ATAC and gene expression modalities; + for the ATAC data it is measured in terms of median unique fragments per cell and for gene + expression it is measured in terms of the average number of reads that are confidently mapped + to the transcriptome per cell. However, it is possible to turn off this normalization altogether. + s:about: | Aggregate data from multiple `cellranger-arc count` runs diff --git a/tools/cellranger-arc-count.cwl b/tools/cellranger-arc-count.cwl index 1a25415f..d302012a 100644 --- a/tools/cellranger-arc-count.cwl +++ b/tools/cellranger-arc-count.cwl @@ -73,7 +73,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: cumulusprod/cellranger-arc:2.0.0 + dockerPull: cumulusprod/cellranger-arc:2.0.2 inputs: @@ -369,9 +369,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cell Ranger ARC count - generates single cell feature counts for a single multiome library" -s:name: "Cell Ranger ARC count - generates single cell feature counts for a single multiome library" -s:alternateName: "Counts ATAC and gene expression reads from a single 10x Genomics Cell Ranger Multiome ATAC + Gene Expression library" +label: "Cell Ranger ARC Count Chromatin Accessibility and Gene Expression" +s:name: "Cell Ranger ARC Count Chromatin Accessibility and Gene Expression" +s:alternateName: "Quantifies chromatin accessibility and gene expression from a single-cell Multiome ATAC/RNA-Seq library" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-arc-count.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -409,20 +409,17 @@ s:creator: doc: | - Count ATAC and gene expression reads from a single library. 
+ Cell Ranger ARC Count Chromatin Accessibility and Gene Expression + ================================================================= - Cell Ranger ARC count performs alignment, filtering, barcode counting, - peak calling and counting of both ATAC and GEX molecules. Furthermore, - it uses the Chromium cellular barcodes to generate feature-barcode matrices, - perform dimensionality reduction, determine clusters, perform differential - analysis on clusters and identify linkages between peaks and genes. The - count pipeline can take input from multiple sequencing runs on the same - GEM well. + Quantifies chromatin accessibility and gene expression from a + single-cell Multiome ATAC/RNA-Seq library. Parameters set by default: --disable-ui - no need in any UI when running in Docker container --id - hardcoded to `sample` to simplify output files location - --libraries - points to the file libraries.csv generated based on the input FASTQ files + --libraries - points to the file libraries.csv generated based on + the input FASTQ files No implemented parameters: --no-bam - we want to always generate BAM files @@ -441,6 +438,15 @@ doc: | https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/using/using/fastq-input + Cell Ranger ARC count performs alignment, filtering, barcode counting, + peak calling and counting of both ATAC and GEX molecules. Furthermore, + it uses the Chromium cellular barcodes to generate feature-barcode matrices, + perform dimensionality reduction, determine clusters, perform differential + analysis on clusters and identify linkages between peaks and genes. The + count pipeline can take input from multiple sequencing runs on the same + GEM well. 
+ + s:about: | Count ATAC and gene expression reads from a single library diff --git a/tools/cellranger-arc-mkref.cwl b/tools/cellranger-arc-mkref.cwl index f77fc584..d5406394 100644 --- a/tools/cellranger-arc-mkref.cwl +++ b/tools/cellranger-arc-mkref.cwl @@ -36,7 +36,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: cumulusprod/cellranger-arc:2.0.0 + dockerPull: cumulusprod/cellranger-arc:2.0.2 inputs: @@ -116,9 +116,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cell Ranger ARC mkref - builds compatible with Cell Ranger ARC indices" -s:name: "Cell Ranger ARC mkref - builds compatible with Cell Ranger ARC indices" -s:alternateName: "Builds compatible with Cell Ranger ARC reference folder from user-supplied genome FASTA and gene GTF files" +label: "Cell Ranger ARC Build Reference Indices" +s:name: "Cell Ranger ARC Build Reference Indices" +s:alternateName: "Builds Cell Ranger ARC compatible reference folder from the custom genome FASTA and gene GTF annotation files" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-arc-mkref.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -156,12 +156,18 @@ s:creator: doc: | - Reference preparation tool for 10x Genomics Cell Ranger Multiome ATAC + Gene Expression - + Cell Ranger ARC Build Reference Indices + ==================================================================== + + Builds Cell Ranger ARC compatible reference folder from the custom + genome FASTA and gene GTF annotation files. + Notes: - - `input_motifs` parameter in the `config.txt` file is not implemented - - if GTF file provided in `annotation_gtf_file` has duplicate gene_id, they should be - grouped together. Applicable to to USCS RefGene annotations. + - `input_motifs` parameter in the `config.txt` file is not + implemented. 
+ - if GTF file provided in `annotation_gtf_file` has records with + duplicate gene_id, they should be grouped together. Applicable to + UCSC RefGene annotations. s:about: | diff --git a/tools/cellranger-count.cwl b/tools/cellranger-count.cwl index 0b5b4c27..117ae521 100644 --- a/tools/cellranger-count.cwl +++ b/tools/cellranger-count.cwl @@ -34,7 +34,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: cumulusprod/cellranger:4.0.0 + dockerPull: cumulusprod/cellranger:7.0.0 inputs: @@ -63,39 +63,74 @@ inputs: Path of folder containing 10x-compatible transcriptome reference. Should be generated by "cellranger mkref" command - expect_cells: + r1_length: type: int? inputBinding: position: 6 + prefix: "--r1-length" + doc: | + Limit the length of the input Read 1 sequence of Gene Expression library + to the first N bases, where N is a user-supplied value. Note that the length + includes the 10x Barcode and UMI sequences so do not set this below 26 for + Single Cell 3′ v2 or Single Cell 5′. This and --r2-length are useful options + for determining the optimal read length for sequencing. + + r2_length: + type: int? + inputBinding: + position: 7 + prefix: "--r2-length" + doc: | + Limit the length of the input R2 sequence to the first N bases, where N is a + user-supplied value. Trimming occurs before sequencing metrics are computed + and therefore, limiting R2 read length may affect Q30 scores. + + expect_cells: + type: int? + inputBinding: + position: 8 prefix: "--expect-cells" doc: | Expected number of recovered cells. - Default: 3,000 cells + Starting in Cell Ranger 7.0, the expected number of cells can be either auto-estimated + or specified with --expect-cells. To replicate an old cellranger count analysis, set + this parameter to 3,000 cells. force_cells: type: int? inputBinding: - position: 7 + position: 9 prefix: "--force-cells" doc: | Force pipeline to use this number of cells, bypassing the cell detection algorithm. 
Use this if the number of cells estimated by Cell Ranger is not consistent with the barcode rank plot. - include_introns: + no_bam: type: boolean? inputBinding: - position: 8 - prefix: "--include-introns" + position: 10 + prefix: "--no-bam" doc: | - Add this flag to count reads mapping to intronic regions. - This may improve sensitivity for samples with a significant - amount of pre-mRNA molecules, such as nuclei. + Set this flag to not generate the BAM file. This will reduce the total computation + time for the pipestance and the size of the output directory. If unsure, we recommend + not to use this option. BAM file could be useful for troubleshooting and downstream + analysis + + exclude_introns: + type: boolean? + inputBinding: + position: 11 + prefix: "--include-introns=false" + doc: | + In Cell Ranger v7.0 intronic reads are counted by default for whole transcriptome + gene expression data, except when --target-panel is used. Therefore, here we provide + a flag to disable this default behavior. threads: type: int? inputBinding: - position: 9 + position: 12 prefix: "--localcores" doc: | Set max cores the pipeline may request at one time. @@ -104,7 +139,7 @@ inputs: memory_limit: type: int? inputBinding: - position: 10 + position: 13 prefix: "--localmem" doc: | Set max GB the pipeline may request at one time @@ -113,7 +148,7 @@ inputs: virt_memory_limit: type: int? inputBinding: - position: 11 + position: 14 prefix: "--localvmem" doc: | Set max virtual address space in GB for the pipeline @@ -137,7 +172,7 @@ outputs: Run summary metrics in CSV format possorted_genome_bam_bai: - type: File + type: File? outputBinding: glob: "sample/outs/possorted_genome_bam.bam" secondaryFiles: @@ -151,7 +186,6 @@ outputs: glob: "sample/outs/filtered_feature_bc_matrix" doc: | Folder with filtered feature-barcode matrices containing only cellular barcodes in MEX format. - When implemented, in Targeted Gene Expression samples, the non-targeted genes won't be present. 
filtered_feature_bc_matrix_h5: type: File @@ -159,8 +193,6 @@ outputs: glob: "sample/outs/filtered_feature_bc_matrix.h5" doc: | Filtered feature-barcode matrices containing only cellular barcodes in HDF5 format. - When implemented, in Targeted Gene Expression samples, the non-targeted genes won't - be present. raw_feature_bc_matrices_folder: type: Directory @@ -206,7 +238,7 @@ outputs: type: stderr -baseCommand: ["cellranger", "count", "--disable-ui", "--fastqs", ".", "--id", "sample"] +baseCommand: ["cellranger", "count", "--disable-ui", "--fastqs", ".", "--sample", "sample", "--id", "sample"] stdout: cellranger_count_stdout.log @@ -219,9 +251,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cellranger count - generates single cell feature counts for a single library" -s:name: "Cellranger count - generates single cell feature counts for a single library" -s:alternateName: "Counts gene expression and feature barcoding reads from a single sample and GEM well" +label: "Cell Ranger Count Gene Expression" +s:name: "Cell Ranger Count Gene Expression" +s:alternateName: "Quantifies gene expression from a single scRNA-Seq library" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-count.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -259,65 +291,99 @@ s:creator: doc: | - Generates single cell feature counts for a single library. + Cell Ranger Count Gene Expression - Input parameters for Feature Barcode, Targeted Gene Expression and CRISPR-specific - analyses are not implemented, therefore the correspondent outputs are also excluded. + Quantifies gene expression from a single-cell RNA-Seq library. + + New in Cell Ranger v7.0: Intronic reads are counted by default for + whole transcriptome gene expression data. 
For more details see + https://support.10xgenomics.com/docs/intron-mode-rec + + Input parameters for Feature Barcode and Targeted Gene Expression + analyses are not implemented, therefore the corresponding outputs + are also excluded. Parameters set by default: --disable-ui - no need in any UI when running in Docker container - --id - can be hardcoded as we rename input files anyway - --fastqs - points to the current directory, because input FASTQ files are staged there - - Why do we need to rename input files? - Refer to the "My FASTQs are not named like any of the above examples" section of - https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input + --id - can be hardcoded as we rename input files anyway + --fastqs - points to the current directory, because input + FASTQ files are staged there + --sample - hardcoded to sample as we stage input fastq files + with the hardcoded names + + Not implemented parameters: + --description - not needed for now + --project - not needed to select input files by folder + --lanes - not needed for now + --libraries - needed only for Gene expression + Feature Barcode analysis + --feature-ref - needed only for Feature Barcode analysis + --target-panel - needed only for Targeted Gene Expression analysis + --nosecondary - no reason to disable it + --chemistry - cell ranger will autodetect the library by default + --no-libraries - used only in Feature Barcode analysis + --check-library-compatibility - no reason to disable it + --no-target-umi-filter - needed only for Targeted Gene Expression analysis + --dry - not applicable to our use case + --jobmode - we use default local mode + --mempercore - not used for local mode + --maxjobs - not used for local mode + --jobinterval - not used for local mode + --overrides - not needed for now + --uiport - we disabled UI + --noexit - we disabled UI + --nopreflight - no reason to skip preflight checks s:about: | - Count gene expression and feature
barcoding reads from a single sample and GEM well + Count gene expression (targeted or whole-transcriptome) and/or feature barcode reads + from a single sample and GEM well USAGE: - cellranger count [FLAGS] [OPTIONS] --id --transcriptome - - FLAGS: - --no-target-umi-filter Turn off the target UMI filtering subpipeline - --nosecondary Disable secondary analysis, e.g. clustering. Optional - --no-libraries Proceed with processing using a --feature-ref but no Feature Barcode libraries specified with the 'libraries' flag - --dry Do not execute the pipeline. Generate a pipeline invocation (.mro) file and stop - --disable-ui Do not serve the UI - --noexit Keep web UI running after pipestance completes or fails - --nopreflight Skip preflight checks - -h, --help Prints help information + cellranger count [OPTIONS] --id --transcriptome OPTIONS: - --id A unique run id and output folder name [a-zA-Z0-9_-]+ - --description Sample description to embed in output files - --transcriptome Path of folder containing 10x-compatible transcriptome reference - -f, --fastqs ... Path to input FASTQ data - -p, --project Name of the project folder within a mkfastq or bcl2fastq-generated folder to pick FASTQs from - -s, --sample ... Prefix of the filenames of FASTQs to select - --lanes ... Only use FASTQs from selected lanes - --libraries CSV file declaring input library data sources - --feature-ref Feature reference CSV file, declaring Feature Barcode constructs and associated barcodes - --target-panel The target panel CSV file declaring the target panel used, if any - --expect-cells Expected number of recovered cells - --force-cells Force pipeline to use this number of cells, bypassing cell detection - --r1-length Hard trim the input Read 1 to this length before analysis - --r2-length Hard trim the input Read 2 to this length before analysis - --chemistry Assay configuration. NOTE: by default the assay configuration is detected automatically, which is the recommened mode. 
You usually will not need - to specify a chemistry. Options are: 'auto' for autodetection, 'threeprime' for Single Cell 3', 'fiveprime' for Single Cell 5', 'SC3Pv1' or - 'SC3Pv2' or 'SC3Pv3' for Single Cell 3' v1/v2/v3, 'SC5P-PE' or 'SC5P-R2' for Single Cell 5', paired-end/R2-only, 'SC-FB' for Single Cell Antibody- - only 3' v2 or 5' [default: auto] - --jobmode Job manager to use. Valid options: local (default), sge, lsf, slurm or a .template file. Search for help on "Cluster Mode" at - support.10xgenomics.com for more details on configuring the pipeline to use a compute cluster [default: local] - --localcores Set max cores the pipeline may request at one time. Only applies to local jobs - --localmem Set max GB the pipeline may request at one time. Only applies to local jobs - --localvmem Set max virtual address space in GB for the pipeline. Only applies to local jobs - --mempercore Reserve enough threads for each job to ensure enough memory will be available, assuming each core on your cluster has at least this much memory - available. Only applies in cluster jobmodes - --maxjobs Set max jobs submitted to cluster at one time. Only applies in cluster jobmodes - --jobinterval Set delay between submitting jobs to cluster, in ms. Only applies in cluster jobmodes - --overrides The path to a JSON file that specifies stage-level overrides for cores and memory. Finer-grained than --localcores, --mempercore and --localmem. 
- Consult the 10x support website for an example override file - --uiport Serve web UI at http://localhost:PORT \ No newline at end of file + --id A unique run id and output folder name [a-zA-Z0-9_-]+ + --description Sample description to embed in output files [default: ] + --transcriptome Path of folder containing 10x-compatible transcriptome reference + --fastqs Path to input FASTQ data + --project Name of the project folder within a mkfastq or bcl2fastq-generated folder from which to pick FASTQs + --sample Prefix of the filenames of FASTQs to select + --lanes Only use FASTQs from selected lanes + --libraries CSV file declaring input library data sources + --feature-ref Feature reference CSV file, declaring Feature Barcode constructs and associated barcodes + --target-panel The target panel CSV file declaring the target panel used, if any. Default analysis will exclude intronic mapped reads, which + is the recommended mode for targeted assay. Use include-introns=true to include intronic mapped reads in analysis + --expect-cells Expected number of recovered cells, used as input to cell calling algorithm + --force-cells Force pipeline to use this number of cells, bypassing cell calling algorithm. [MINIMUM: 10] + --no-bam Set --no-bam to not generate the BAM file. This will reduce the total computation time for the pipestance and the size of the + output directory. If unsure, we recommend not to use this option. BAM file could be useful for troubleshooting and downstream + analysis + --nosecondary Disable secondary analysis, e.g. clustering. Optional + --r1-length Hard trim the input Read 1 to this length before analysis + --r2-length Hard trim the input Read 2 to this length before analysis + --include-introns Include intronic reads in count (default=true unless --target-panel is specified in which case default=false) + --chemistry Assay configuration. NOTE: by default the assay configuration is detected automatically, which is the recommened mode. 
You + usually will not need to specify a chemistry. Options are: 'auto' for autodetection, 'threeprime' for Single Cell 3', + 'fiveprime' for Single Cell 5', 'SC3Pv1' or 'SC3Pv2' or 'SC3Pv3' for Single Cell 3' v1/v2/v3, 'SC3Pv3LT' for Single Cell 3' + v3 LT, 'SC3Pv3HT' for Single Cell 3' v3 HT, 'SC5P-PE' or 'SC5P-R2' for Single Cell 5', paired-end/R2-only, 'SC-FB' for Single + Cell Antibody-only 3' v2 or 5' [default: auto] + --no-libraries Proceed with processing using a --feature-ref but no Feature Barcode libraries specified with the 'libraries' flag + --check-library-compatibility Whether to check for barcode compatibility between libraries. [default: true] + --no-target-umi-filter Turn off the target UMI filtering subpipeline. Only applies when --target-panel is used + --dry Do not execute the pipeline. Generate a pipeline invocation (.mro) file and stop + --jobmode Job manager to use. Valid options: local (default), sge, lsf, slurm or path to a .template file. Search for help on "Cluster + Mode" at support.10xgenomics.com for more details on configuring the pipeline to use a compute cluster [default: local] + --localcores Set max cores the pipeline may request at one time. Only applies to local jobs + --localmem Set max GB the pipeline may request at one time. Only applies to local jobs + --localvmem Set max virtual address space in GB for the pipeline. Only applies to local jobs + --mempercore Reserve enough threads for each job to ensure enough memory will be available, assuming each core on your cluster has at least + this much memory available. Only applies to cluster jobmodes + --maxjobs Set max jobs submitted to cluster at one time. Only applies to cluster jobmodes + --jobinterval Set delay between submitting jobs to cluster, in ms. Only applies to cluster jobmodes + --overrides The path to a JSON file that specifies stage-level overrides for cores and memory. Finer-grained than --localcores, + --mempercore and --localmem. 
Consult https://support.10xgenomics.com/ for an example override file + --uiport Serve web UI at http://localhost:PORT + --disable-ui Do not serve the web UI + --noexit Keep web UI running after pipestance completes or fails + --nopreflight Skip preflight checks + -h, --help Print help information \ No newline at end of file diff --git a/tools/cellranger-mkref.cwl b/tools/cellranger-mkref.cwl index 7b0f974f..96a90206 100644 --- a/tools/cellranger-mkref.cwl +++ b/tools/cellranger-mkref.cwl @@ -16,7 +16,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: cumulusprod/cellranger:4.0.0 + dockerPull: cumulusprod/cellranger:7.0.0 inputs: @@ -96,9 +96,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cell Ranger mkref - builds a Cell Ranger compatible indices" -s:name: "Cell Ranger mkref - builds a Cell Ranger compatible indices" -s:alternateName: "Builds a Cell Ranger compatible reference folder from user-supplied genome FASTA and gene GTF files" +label: "Cell Ranger Build Reference Indices" +s:name: "Cell Ranger Build Reference Indices" +s:alternateName: "Builds Cell Ranger compatible reference folder from the custom genome FASTA and gene GTF annotation files" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-mkref.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -136,11 +136,18 @@ s:creator: doc: | - Builds a Cell Ranger compatible reference folder from user-supplied - genome FASTA and gene GTF files. + Cell Ranger Build Reference Indices + + Builds Cell Ranger compatible reference folder from + the custom genome FASTA and gene GTF annotation files. s:about: | + Build a Cell Ranger-compatible reference folder from user-supplied genome + FASTA and gene GTF files. Creates a new folder named after the genome. + + The commands below should be preceded by 'cellranger': + Usage: mkref --genome=NAME ... 
@@ -169,4 +176,4 @@ s:about: | --ref-version= Optional reference version string to include with reference. -h --help Show this message. - --version Show version. + --version Show version. \ No newline at end of file diff --git a/tools/cellranger-mkvdjref.cwl b/tools/cellranger-mkvdjref.cwl new file mode 100644 index 00000000..e8a80c71 --- /dev/null +++ b/tools/cellranger-mkvdjref.cwl @@ -0,0 +1,176 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement + expressionLib: + - var get_output_folder_name = function() { + if (inputs.output_folder_name == ""){ + var root = inputs.genome_fasta_file.basename.split('.').slice(0,-1).join('.'); + return (root == "")?inputs.genome_fasta_file.basename:root; + } else { + return inputs.output_folder_name; + } + }; + +hints: +- class: DockerRequirement + dockerPull: cumulusprod/cellranger:7.0.0 + + +inputs: + + genome_fasta_file: + type: File + inputBinding: + position: 5 + prefix: "--fasta" + doc: | + Genome FASTA file. Hard/soft-masked files are not allowed. + + annotation_gtf_file: + type: File + inputBinding: + position: 6 + prefix: "--genes" + doc: | + GTF annotation file. Should include gene_biotype/transcript_biotype fields + + output_folder_name: + type: string? + inputBinding: + position: 7 + prefix: "--genome" + valueFrom: $(get_output_folder_name()) + default: "" + doc: | + Unique genome name, used to name output folder + + +outputs: + + indices_folder: + type: Directory + outputBinding: + glob: $(get_output_folder_name()) + doc: | + Cell Ranger V(D)J-compatible reference folder. + This folder will include V(D)J segment FASTA file. 
+ + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["cellranger", "mkvdjref"] + + +stdout: cellranger_mkvdjref_stdout.log +stderr: cellranger_mkvdjref_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Cell Ranger Build V(D)J Reference Indices" +s:name: "Cell Ranger Build V(D)J Reference Indices" +s:alternateName: "Build a Cell Ranger V(D)J-compatible reference folder from a user-supplied genome FASTA and gene GTF files" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-mkvdjref.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Cell Ranger Build V(D)J Reference Indices + + Build a Cell Ranger V(D)J-compatible reference folder from: + 1) A user-supplied genome FASTA and gene GTF files. + For example, using files from ENSEMBL. + 2) A FASTA file containing V(D)J segments as per the mkvdjref spec. + For example, using files from IMGT. 
+ + For simplicity, only option 1) is supported: the user needs to + provide a GTF annotation file; the --seqs input is not implemented. + + Chromosome names in the GTF file should correspond to the chromosome + names in the FASTA file. + + +s:about: | + Reference preparation tool for 10x Genomics Cell Ranger V(D)J assembler. + + Build a Cell Ranger V(D)J-compatible reference folder from: + 1) A user-supplied genome FASTA and gene GTF files. + For example, using files from ENSEMBL. + OR + 2) A FASTA file containing V(D)J segments as per the mkvdjref spec. + For example, using files from IMGT. + + Creates a new folder named after the genome. + + The commands below should be preceded by 'cellranger': + + Usage: + mkvdjref --genome=NAME --fasta=PATH --genes=PATH ...[options] + mkvdjref --genome=NAME --seqs=PATH [options] + mkvdjref -h | --help | --version + + Arguments: + genome A unique genome name, used to name output folder + [a-zA-Z0-9_-]+. + fasta Path to FASTA file containing your genome reference. + genes One or more GTF files containing annotated genes for + your genome reference. Specify multiple files by + specifying the --genes argument multiple times. The + files will be concatenated. + seqs A FASTA file that directly specifies V(D)J sequences. + This is mutually exclusive with the "fasta" and + "genes" args above. + + Options: + --ref-version= + Optional reference version string to include. + --rm-transcripts=PATH + Path to text file with transcript IDs to ignore. This + file should have one transcript ID per line where + the IDs correspond to the "transcript_id" key in the + GTF info column. + -h --help Show this message. + --version Show version.
diff --git a/tools/cellranger-multi.cwl b/tools/cellranger-multi.cwl new file mode 100644 index 00000000..95efc03b --- /dev/null +++ b/tools/cellranger-multi.cwl @@ -0,0 +1,564 @@ +cwlVersion: v1.0 +class: CommandLineTool + +# https://www.10xgenomics.com/resources/datasets/human-b-cells-from-a-healthy-donor-1-k-cells-2-standard-6-0-0 +# https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/using/multi#what +# https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/output/overview-multi +# https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/output/annotation#consensus +# https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/output/annotation#airr +# https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/output/annotation#clonotype +# https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/output/overview#header +# https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/output/bam + + +requirements: +- class: InlineJavascriptRequirement +- class: InitialWorkDirRequirement + listing: | + ${ + var listing = [ + { + "entry": inputs.gex_fastq_file_r1, + "entryname": "gex_S1_L001_R1_001.fastq", + "writable": true + }, + { + "entry": inputs.gex_fastq_file_r2, + "entryname": "gex_S1_L001_R2_001.fastq", + "writable": true + }, + { + "entry": inputs.vdj_fastq_file_r1, + "entryname": "vdj_S1_L001_R1_001.fastq", + "writable": true + }, + { + "entry": inputs.vdj_fastq_file_r2, + "entryname": "vdj_S1_L001_R2_001.fastq", + "writable": true + }, + { + "entry":`[gene-expression] + reference,${inputs.gex_indices_folder.path} + [vdj] + reference,${inputs.vdj_indices_folder.path} + [libraries] + fastq_id,fastqs,lanes,feature_types + gex,${runtime.outdir},1,gene expression, + vdj,${runtime.outdir},1,${inputs.vdj_chain_type}`, + "entryname": "libraries.csv" + } + ] + if (inputs.gex_fastq_file_i1){ + listing.push( + { + "entry": inputs.gex_fastq_file_i1, + 
"entryname": "gex_S1_L001_I1_001.fastq", + "writable": true + } + ); + }; + if (inputs.gex_fastq_file_i2){ + listing.push( + { + "entry": inputs.gex_fastq_file_i2, + "entryname": "gex_S1_L001_I2_001.fastq", + "writable": true + } + ); + }; + if (inputs.vdj_fastq_file_i1){ + listing.push( + { + "entry": inputs.vdj_fastq_file_i1, + "entryname": "vdj_S1_L001_I1_001.fastq", + "writable": true + } + ); + }; + if (inputs.vdj_fastq_file_i2){ + listing.push( + { + "entry": inputs.vdj_fastq_file_i2, + "entryname": "vdj_S1_L001_I2_001.fastq", + "writable": true + } + ); + }; + return listing; + } + + +hints: +- class: DockerRequirement + dockerPull: cumulusprod/cellranger:7.0.0 + + +inputs: + + gex_fastq_file_r1: + type: File + doc: | + GEX FASTQ read 1 file (will be staged into workdir as gex_S1_L001_R1_001.fastq) + + gex_fastq_file_r2: + type: File + doc: | + GEX FASTQ read 2 file (will be staged into workdir as gex_S1_L001_R2_001.fastq) + + gex_fastq_file_i1: + type: File? + doc: | + GEX FASTQ index i7 file (will be staged into workdir as gex_S1_L001_I1_001.fastq) + + gex_fastq_file_i2: + type: File? + doc: | + GEX FASTQ index i5 file (will be staged into workdir as gex_S1_L001_I2_001.fastq) + + vdj_fastq_file_r1: + type: File + doc: | + V(D)J FASTQ read 1 file (will be staged into workdir as vdj_S1_L001_R1_001.fastq) + + vdj_fastq_file_r2: + type: File + doc: | + V(D)J FASTQ read 2 file (will be staged into workdir as vdj_S1_L001_R2_001.fastq) + + vdj_fastq_file_i1: + type: File? + doc: | + V(D)J FASTQ index i7 file (will be staged into workdir as vdj_S1_L001_I1_001.fastq) + + vdj_fastq_file_i2: + type: File? + doc: | + V(D)J FASTQ index i5 file (will be staged into workdir as vdj_S1_L001_I2_001.fastq) + + gex_indices_folder: + type: Directory + doc: | + Path of folder containing 10x-compatible transcriptome reference. 
+ Should be generated by "cellranger mkref" command + + vdj_indices_folder: + type: Directory + doc: | + Path of folder containing Cell Ranger V(D)J-compatible reference. + Should be generated by "cellranger mkvdjref" command + + vdj_chain_type: + type: + - "null" + - type: enum + name: "chain_type" + symbols: + - "VDJ" + - "VDJ-T" + - "VDJ-B" + - "VDJ-T-GD" + default: "VDJ" + doc: | + V(D)J chain type. Setting to VDJ will auto-detect the chain type. + Auto-detection does not work for TRG/D (gamma-delta) chains. + Note that gamma-delta analysis is enabled but the algorithm has + not been tested extensively. + + threads: + type: int? + inputBinding: + position: 10 + prefix: "--localcores" + doc: | + Set max cores the pipeline may request at one time. + Default: all available + + memory_limit: + type: int? + inputBinding: + position: 11 + prefix: "--localmem" + doc: | + Set max GB the pipeline may request at one time + Default: all available + + virt_memory_limit: + type: int? + inputBinding: + position: 12 + prefix: "--localvmem" + doc: | + Set max virtual address space in GB for the pipeline + Default: all available + + +outputs: + + web_summary_report: + type: File + outputBinding: + glob: "sample/outs/per_sample_outs/sample/web_summary.html" + doc: | + Run summary metrics and charts in HTML format + + metrics_summary_report: + type: File + outputBinding: + glob: "sample/outs/per_sample_outs/sample/metrics_summary.csv" + doc: | + Run summary metrics in CSV format + + possorted_genome_bam_bai: + type: File + outputBinding: + glob: "sample/outs/per_sample_outs/sample/count/sample_alignments.bam" + secondaryFiles: + - .bai + doc: | + Indexed GEX BAM file containing position-sorted reads aligned to the genome + and transcriptome, as well as unaligned reads. 
+ + filtered_feature_bc_matrix_folder: + type: Directory + outputBinding: + glob: "sample/outs/per_sample_outs/sample/count/sample_filtered_feature_bc_matrix" + doc: | + Folder with filtered feature-barcode matrices containing only cellular + barcodes in MEX format. Each element of the matrix is the number of UMIs + associated with a feature (row) and a barcode (column). + + filtered_feature_bc_matrix_h5: + type: File + outputBinding: + glob: "sample/outs/per_sample_outs/sample/count/sample_filtered_feature_bc_matrix.h5" + doc: | + Filtered feature-barcode matrices containing only cellular + barcodes in HDF5 format. Each element of the matrix is the number of UMIs + associated with a feature (row) and a barcode (column). + + raw_feature_bc_matrices_folder: + type: Directory + outputBinding: + glob: "sample/outs/multi/count/raw_feature_bc_matrix" + doc: | + Folder with unfiltered feature-barcode matrices containing all barcodes + in MEX format. Each element of the matrix is the number of UMIs associated + with a feature (row) and a barcode (column). + + raw_feature_bc_matrices_h5: + type: File + outputBinding: + glob: "sample/outs/multi/count/raw_feature_bc_matrix.h5" + doc: | + Unfiltered feature-barcode matrices containing all barcodes in HDF5 format. + Each element of the matrix is the number of UMIs associated with a feature + (row) and a barcode (column). 
+ + secondary_analysis_report_folder: + type: Directory + outputBinding: + glob: "sample/outs/per_sample_outs/sample/count/analysis" + doc: | + Folder with secondary analysis of GEX data including dimensionality reduction, + cell clustering, and differential expression + + loupe_browser_track: + type: File + outputBinding: + glob: "sample/outs/per_sample_outs/sample/count/sample_cloupe.cloupe" + doc: | + Loupe Browser visualization and analysis file + + all_contig_reads_bam_bai: + type: File + outputBinding: + glob: "sample/outs/multi/vdj_*/all_contig.bam" + secondaryFiles: + - .bai + doc: | + Indexed V(D)J BAM file with reads aligned to ALL assembled contigs, per cell barcode. + This file demonstrates how the reads and UMIs support the assembled contigs within + a cell barcode. Reads are not aligned across cell barcode boundaries. Please note + that this BAM excludes reads whose barcodes don't match the whitelist, so it is not + suitable as an archive of every single input read. + This file includes reads from all cells barcodes identified by V(D)J algorithm including + those ones that will be later discarded as non-viable cells by V(D)J algorithm and those + barcodes that will be later removed after overlapping with cells called by GEX algorithm. + + all_contig_sequences_fasta: + type: File + outputBinding: + glob: "sample/outs/multi/vdj_*/all_contig.fasta" + secondaryFiles: + - .fai + doc: | + FASTA format sequence for ALL assembled contigs in the V(D)J library. + This file includes both productive and non-productive contigs with high and low confidence + assembled for all identified cells barcodes including those ones that will be later discarded + as non-viable cells by V(D)J algorithm or after overlapping with cells called by GEX algorithm. 
+ + all_contig_annotations_bed: + type: File + outputBinding: + glob: "sample/outs/multi/vdj_*/all_contig_annotations.bed" + doc: | + BED file with high-level and detailed annotations of ALL assembled contigs (from cell and + background barcodes). Used for further investigation into why some contigs were filtered + out. This file includes both productive and non-productive contigs with high and low + confidence assembled for all identified cells barcodes including those ones that will be + later discarded as non-viable cells by V(D)J algorithm or after overlapping with cells + called by GEX algorithm. + + all_contig_annotations_csv: + type: File + outputBinding: + glob: "sample/outs/multi/vdj_*/all_contig_annotations.csv" + doc: | + CSV file with high-level and detailed annotations of ALL assembled contigs (from cell and + background barcodes). Used for further investigation into why some contigs were filtered + out. This file includes both productive and non-productive contigs with high and low + confidence assembled for all identified cells barcodes including those ones that will be + later discarded as non-viable cells by V(D)J algorithm or after overlapping with cells + called by GEX algorithm. + + airr_rearrangement_tsv: + type: File + outputBinding: + glob: "sample/outs/per_sample_outs/sample/vdj_*/airr_rearrangement.tsv" + doc: | + Annotated contigs and consensus sequences of V(D)J rearrangements + in the AIRR format. It includes only viable cells identified by + both V(D)J and GEX algorithms. + + clonotypes_csv: + type: File + outputBinding: + glob: "sample/outs/per_sample_outs/sample/vdj_*/clonotypes.csv" + doc: | + CSV file with high-level descriptions of each clonotype. During the clonotype + grouping stage, cell barcodes are placed in groups called clonotypes. Only viable + cells identified by both V(D)J and GEX algorithms are used. 
Each clonotype consists + of all descendants of a single, fully rearranged common ancestor, as approximated + computationally. During this process, some cell barcodes are flagged as likely + artifacts and filtered out, meaning that they are no longer called as cells. + However, as the clonotype grouping stage happens before forming the final version + of files in the per_sample_outs folder, the reported number of cells won't be affected. + + germline_contigs_bam_bai: + type: File + outputBinding: + glob: "sample/outs/per_sample_outs/sample/vdj_*/concat_ref.bam" + secondaryFiles: + - .bai + doc: | + Indexed V(D)J BAM file with contigs aligned to concatenated germline + segments. For each clonotype consensus, the reference sequence is the + annotated germline segments concatenated together. This file shows how + both the per-cell contigs and the clonotype consensus contig relate to + the germline reference. Useful for revealing polymorphisms, somatic + mutations, and recombination-induced differences such as non-templated + nucleotide additions. + + germline_sequences_fasta: + type: File + outputBinding: + glob: "sample/outs/per_sample_outs/sample/vdj_*/concat_ref.fasta" + secondaryFiles: + - .fai + doc: | + Concatenated V(D)J reference segments for the segments detected on each + consensus sequence. These serve as an approximate reference for each + consensus sequence. + + consensus_contigs_bam_bai: + type: File + outputBinding: + glob: "sample/outs/per_sample_outs/sample/vdj_*/consensus.bam" + secondaryFiles: + - .bai + doc: | + Indexed V(D)J BAM file with contigs aligned to clonotype consensus. + Each "reference" sequence is a clonotype consensus sequence, and each + record is an alignment of a single cell's contig against this consensus. + This file shows, for each clonotype consensus sequence, how the constituent + per-cell assemblies support the consensus.
+ + consensus_sequences_fasta: + type: File + outputBinding: + glob: "sample/outs/per_sample_outs/sample/vdj_*/consensus.fasta" + secondaryFiles: + - .fai + doc: | + The consensus sequence of each assembled contig. + + consensus_annotations_csv: + type: File + outputBinding: + glob: "sample/outs/per_sample_outs/sample/vdj_*/consensus_annotations.csv" + doc: | + CSV file with high-level and detailed annotations of each clonotype + consensus sequence. + + filtered_contig_annotations_csv: + type: File + outputBinding: + glob: "sample/outs/per_sample_outs/sample/vdj_*/filtered_contig_annotations.csv" + doc: | + CSV file with high-level annotations of each high-confidence contig from + cell-associated barcodes. This is a subset of all_contig_annotations.csv. + + filtered_contig_sequences_fasta: + type: File + outputBinding: + glob: "sample/outs/per_sample_outs/sample/vdj_*/filtered_contig.fasta" + doc: | + FASTA format sequence for only high-confidence contigs in cell barcodes. + + loupe_vdj_browser_track: + type: File + outputBinding: + glob: "sample/outs/per_sample_outs/sample/vdj_*/vloupe.vloupe" + doc: | + Loupe V(D)J Browser visualization and analysis file + + filtered_data_folder: + type: Directory + outputBinding: + glob: "./sample/outs/per_sample_outs/sample" + doc: | + Folder containing filtered data, i.e., only cell-associated barcodes. + Used by cellranger aggr to aggregate samples for joint analysis. 
+ + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["cellranger", "multi", "--disable-ui", "--csv", "libraries.csv", "--id", "sample"] + + +stdout: cellranger_multi_stdout.log +stderr: cellranger_multi_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Cell Ranger Multi Gene Expression and V(D)J Repertoire Profiling" +s:name: "Cell Ranger Multi Gene Expression and V(D)J Repertoire Profiling" +s:alternateName: "Quantifies gene expression and performs profiling of V(D)J repertoire from a single GEM well" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-multi.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Cell Ranger Multi Gene Expression and V(D)J Repertoire Profiling + ================================================================ + + Quantifies gene expression and performs profiling of V(D)J repertoire + from a single GEM well. 
+ + Parameters set by default: + --disable-ui - no need for any UI when running in Docker container + --id - hardcoded to `sample` to simplify output files location + --csv - points to the file libraries.csv generated based on + the input FASTQ files + + Not implemented parameters: + --description + --dry + --jobmode (we will use local by default) + --mempercore + --maxjobs + --jobinterval + --overrides + --uiport + --noexit + --nopreflight + + Since running cellranger aggr with cellranger multi outputs requires + only the per_sample_outs/sample folder, which already includes all + necessary files, there is no need to return the following files + as separate outputs: + - sample_molecule_info.h5 - used for GEX aggregation + - vdj_contig_info.pb - used for V(D)J aggregation + + Why do we need to rename input files? + https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/using/using/fastq-input + + + The cellranger multi pipeline takes FASTQ files from cellranger mkfastq, BCL Convert, + or bcl2fastq for any combination of 5' single cell gene expression, Feature Barcode + (cell surface protein or antigen) and V(D)J libraries from a single GEM well. It + performs alignment, filtering, barcode counting, and UMI counting on the gene expression + and/or Feature Barcode libraries. It also performs sequence assembly and paired clonotype + calling on the V(D)J libraries. Additionally, the cell calls provided by the gene + expression data are used to improve the cell calls inferred by the V(D)J library. + + +s:about: | + Analyze multiplexed data or combined gene expression/immune profiling/feature barcode data + + USAGE: + cellranger multi [OPTIONS] --id --csv + + OPTIONS: + --id A unique run id and output folder name [a-zA-Z0-9_-]+ + --description Sample description to embed in output files [default: ] + --csv Path of CSV file enumerating input libraries and analysis parameters + --dry Do not execute the pipeline.
Generate a pipeline invocation (.mro) file and stop + --jobmode Job manager to use. Valid options: local (default), sge, lsf, slurm or path to a .template file. Search for help on "Cluster Mode" at + support.10xgenomics.com for more details on configuring the pipeline to use a compute cluster [default: local] + --localcores Set max cores the pipeline may request at one time. Only applies to local jobs + --localmem Set max GB the pipeline may request at one time. Only applies to local jobs + --localvmem Set max virtual address space in GB for the pipeline. Only applies to local jobs + --mempercore Reserve enough threads for each job to ensure enough memory will be available, assuming each core on your cluster has at least this much memory + available. Only applies to cluster jobmodes + --maxjobs Set max jobs submitted to cluster at one time. Only applies to cluster jobmodes + --jobinterval Set delay between submitting jobs to cluster, in ms. Only applies to cluster jobmodes + --overrides The path to a JSON file that specifies stage-level overrides for cores and memory. Finer-grained than --localcores, --mempercore and --localmem. 
+ Consult https://support.10xgenomics.com/ for an example override file + --uiport Serve web UI at http://localhost:PORT + --disable-ui Do not serve the web UI + --noexit Keep web UI running after pipestance completes or fails + --nopreflight Skip preflight checks + -h, --help Print help information \ No newline at end of file diff --git a/tools/cellranger-reanalyze.cwl b/tools/cellranger-reanalyze.cwl index 6b0aeb6f..a872c23b 100644 --- a/tools/cellranger-reanalyze.cwl +++ b/tools/cellranger-reanalyze.cwl @@ -4,21 +4,19 @@ class: CommandLineTool requirements: - class: InlineJavascriptRequirement - +- class: DockerRequirement + dockerPull: cumulusprod/cellranger:7.0.0 hints: -- class: DockerRequirement - dockerPull: cumulusprod/cellranger:4.0.0 - class: InitialWorkDirRequirement listing: | ${ const skipped_ids = [ "feature_bc_matrix_h5", - "aggregation_metadata", "selected_barcodes", "selected_genes", "excluded_genes", - "force_cells_num", + "force_cells", "threads", "memory_limit", "virt_memory_limit" @@ -32,7 +30,7 @@ hints: } return [{ "entry": entry, - "entryname": runtime.outdir + "/params.csv" + "entryname": "params.csv" }]; } @@ -45,22 +43,13 @@ inputs: position: 5 prefix: "--matrix" doc: | - Filtered or raw feature-barcode matrices in HDF5 format - - aggregation_metadata: - type: File? - inputBinding: - position: 6 - prefix: "--agg" - doc: | - Aggregation CSV metadata file obtained from cellranger aggr. - This allows you to retain any metadata associated with the - samples for display in Loupe Browser. + A feature-barcode matrix containing data for one genome. + Should be the filtered version, unless using --force-cells selected_barcodes: type: File? inputBinding: - position: 7 + position: 6 prefix: "--barcodes" doc: | A CSV file containing a list of cell barcodes to use for reanalysis, @@ -70,30 +59,28 @@ inputs: selected_genes: type: File? 
inputBinding: - position: 8 + position: 7 prefix: "--genes" doc: | A CSV file containing a list of gene IDs to use for reanalysis (corresponding to the gene_id field of the reference GTF). All gene IDs must be present in the matrix. Note that only gene features are used in secondary analysis. - Feature Barcode features are ignored. excluded_genes: type: File? inputBinding: - position: 9 + position: 8 prefix: "--exclude-genes" doc: | A CSV file containing a list of gene IDs to exclude for reanalysis (corresponding to the gene_id field of the reference GTF). All gene IDs must be present in the matrix. The exclusion is applied after setting the gene list with --genes. - Note that only gene features are used in secondary analysis. Feature Barcode features - are ignored. + Note that only gene features are used in secondary analysis. - force_cells_num: + force_cells: type: int? inputBinding: - position: 10 + position: 9 prefix: "--force-cells" doc: | Force pipeline to use this number of cells, bypassing the cell detection algorithm. @@ -104,7 +91,7 @@ inputs: threads: type: int? inputBinding: - position: 11 + position: 10 prefix: "--localcores" doc: | Set max cores the pipeline may request at one time. @@ -113,7 +100,7 @@ inputs: memory_limit: type: int? inputBinding: - position: 12 + position: 11 prefix: "--localmem" doc: | Set max GB the pipeline may request at one time @@ -122,7 +109,7 @@ inputs: virt_memory_limit: type: int? inputBinding: - position: 13 + position: 12 prefix: "--localvmem" doc: | Set max virtual address space in GB for the pipeline @@ -366,6 +353,14 @@ outputs: doc: | Reanalyzed run summary metrics and charts in HTML format + filtered_feature_bc_matrix_folder: + type: Directory + outputBinding: + glob: "reanalyzed/outs/filtered_feature_bc_matrix" + doc: | + Folder with filtered feature-barcode matrices containing only cellular + barcodes in MEX format. 
+ reanalyze_params: type: File outputBinding: @@ -389,7 +384,7 @@ outputs: baseCommand: ["cellranger", "reanalyze", "--disable-ui", "--id", "reanalyzed"] arguments: -- valueFrom: $(runtime.outdir + "/params.csv") # fails if it's not absolute path +- valueFrom: "params.csv" prefix: "--params" position: 15 @@ -404,9 +399,11 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cellranger reanalyze - reruns secondary analysis performed on the feature-barcode matrix" -s:name: "Cellranger reanalyze - reruns secondary analysis performed on the feature-barcode matrix" -s:alternateName: "Reruns secondary analysis performed on the feature-barcode matrix (dimensionality reduction, clustering and visualization) using different parameter settings" +label: "Cell Ranger Reanalyze" +s:name: "Cell Ranger Reanalyze" +s:alternateName: | + Reruns secondary analysis performed on the GEX feature-barcode matrix (dimensionality reduction, + clustering and visualization) using different parameter settings s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-reanalyze.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -444,72 +441,69 @@ s:creator: doc: | - Tool runs cellranger reanalyze command to rerun secondary analysis performed on - the feature-barcode matrix (dimensionality reduction, clustering and visualization) + Cell Ranger Reanalyze + + Runs cellranger reanalyze command to rerun secondary analysis performed on the + GEX feature-barcode matrix (dimensionality reduction, clustering and visualization) using different parameter settings. + Rerunning the analysis for aggregated experiments is not currently supported. 
+ Parameters set by default: --disable-ui - no need in any UI when running in Docker container - --id - hardcoded to `reanalyzed` as we want to return the content of the - output folder as separate outputs - - Skipped parameters: - --dry - --noexit - --nopreflight - --description - --jobmode - --mempercore - --maxjobs - --jobinterval - --overrides - --uiport + --id - hardcoded to `reanalyzed` as we want to return the content of the + output folder as separate outputs Skipped outputs as they are identical to inputs: - - Filtered feature-barcode matrices MEX - Filtered feature-barcode matrices HDF5 - - Copy of the input aggregation CSV - Notes: - - Passing `aggregation_metadata` might not work as it will require additional inputs for - all files from that CSV file. Otherwise cellranger will fail to parse it. Address this - question when needed. + Not implemented parameters: + --description - not needed for now + --agg - we don't support rerunning secondary analysis from aggregated samples + --dry - not applicable to our use case + --jobmode - we use default local mode + --mempercore - not used for local mode + --maxjobs - not used for local mode + --jobinterval - not used for local mode + --overrides - not needed for now + --uiport - we disabled UI + --noexit - we disabled UI + --nopreflight - no reason to skip preflight checks + + s:about: | Re-run secondary analysis (dimensionality reduction, clustering, etc) USAGE: - cellranger reanalyze [FLAGS] [OPTIONS] --id --matrix - - FLAGS: - --dry Do not execute the pipeline.
Generate a pipeline invocation (.mro) file and stop - --disable-ui Do not serve the UI - --noexit Keep web UI running after pipestance completes or fails - --nopreflight Skip preflight checks - -h, --help Prints help information + cellranger reanalyze [OPTIONS] --id --matrix OPTIONS: - --id A unique run id and output folder name [a-zA-Z0-9_-]+ - --description Sample description to embed in output files [default: ] - --matrix A feature-barcode matrix containing data for one genome. Should be the filtered version, unless using --force-cells - --params A CSV file specifying analysis parameters. Optional - --barcodes A CSV file containing a list of cell barcodes to use for reanalysis, e.g. barcodes exported from Loupe Browser. Optional - --genes A CSV file containing a list of feature IDs to use for reanalysis. For gene expression, this should correspond to the gene_id field in the - reference GTF should be \(e.g. ENSG... for ENSEMBL-based references\). Optional - --exclude-genes A CSV file containing a list of feature IDs to exclude from reanalysis. For gene expression, this should correspond to the gene_id field in - the reference GTF \(e.g., ENSG... for ENSEMBL-based references\). The exclusion is applied after --genes. Optional - --agg If the input matrix was produced by 'aggr', you may pass the same aggregation CSV in order to retain per-library tag information in the - resulting .cloupe file. This argument is required to enable chemistry batch correction. Optional - --force-cells Force pipeline to use this number of cells, bypassing the cell detection algorithm. Optional - --jobmode Job manager to use. Valid options: local (default), sge, lsf, slurm or a .template file. Search for help on "Cluster Mode" at - support.10xgenomics.com for more details on configuring the pipeline to use a compute cluster [default: local] - --localcores Set max cores the pipeline may request at one time. 
Only applies to local jobs - --localmem Set max GB the pipeline may request at one time. Only applies to local jobs - --localvmem Set max virtual address space in GB for the pipeline. Only applies to local jobs - --mempercore Reserve enough threads for each job to ensure enough memory will be available, assuming each core on your cluster has at least this much - memory available. Only applies in cluster jobmodes - --maxjobs Set max jobs submitted to cluster at one time. Only applies in cluster jobmodes - --jobinterval Set delay between submitting jobs to cluster, in ms. Only applies in cluster jobmodes - --overrides The path to a JSON file that specifies stage-level overrides for cores and memory. Finer-grained than --localcores, --mempercore and - --localmem. Consult the 10x support website for an example override file - --uiport Serve web UI at http://localhost:PORT \ No newline at end of file + --id A unique run id and output folder name [a-zA-Z0-9_-]+ + --description Sample description to embed in output files [default: ] + --matrix A feature-barcode matrix containing data for one genome. Should be the filtered version, unless using --force-cells + --params A CSV file specifying analysis parameters. Optional + --barcodes A CSV file containing a list of cell barcodes to use for reanalysis, e.g. barcodes exported from Loupe Browser. Optional + --genes A CSV file containing a list of feature IDs to use for reanalysis. For gene expression, this should correspond to the gene_id field in the + reference GTF should be \(e.g. ENSG... for ENSEMBL-based references\). Optional + --exclude-genes A CSV file containing a list of feature IDs to exclude from reanalysis. For gene expression, this should correspond to the gene_id field in + the reference GTF \(e.g., ENSG... for ENSEMBL-based references\). The exclusion is applied after --genes. 
Optional + --agg If the input matrix was produced by 'aggr', you may pass the same aggregation CSV in order to retain per-library tag information in the + resulting .cloupe file. This argument is required to enable chemistry batch correction. Optional + --force-cells Force pipeline to use this number of cells, bypassing cell calling algorithm. [MINIMUM: 10] + --dry Do not execute the pipeline. Generate a pipeline invocation (.mro) file and stop + --jobmode Job manager to use. Valid options: local (default), sge, lsf, slurm or path to a .template file. Search for help on "Cluster Mode" at + support.10xgenomics.com for more details on configuring the pipeline to use a compute cluster [default: local] + --localcores Set max cores the pipeline may request at one time. Only applies to local jobs + --localmem Set max GB the pipeline may request at one time. Only applies to local jobs + --localvmem Set max virtual address space in GB for the pipeline. Only applies to local jobs + --mempercore Reserve enough threads for each job to ensure enough memory will be available, assuming each core on your cluster has at least this much + memory available. Only applies to cluster jobmodes + --maxjobs Set max jobs submitted to cluster at one time. Only applies to cluster jobmodes + --jobinterval Set delay between submitting jobs to cluster, in ms. Only applies to cluster jobmodes + --overrides The path to a JSON file that specifies stage-level overrides for cores and memory. Finer-grained than --localcores, --mempercore and + --localmem. 
Consult https://support.10xgenomics.com/ for an example override file + --uiport Serve web UI at http://localhost:PORT + --disable-ui Do not serve the web UI + --noexit Keep web UI running after pipestance completes or fails + --nopreflight Skip preflight checks + -h, --help Print help information \ No newline at end of file diff --git a/tools/collect-stats-sc-arc-count.cwl b/tools/collect-stats-sc-arc-count.cwl new file mode 100644 index 00000000..e38106f5 --- /dev/null +++ b/tools/collect-stats-sc-arc-count.cwl @@ -0,0 +1,116 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: ShellCommandRequirement +- class: InlineJavascriptRequirement + expressionLib: + - var get_output_prefix = function() { + if (inputs.output_prefix) { + return inputs.output_prefix; + } + var root = inputs.metrics_summary_report.basename.split('.').slice(0,-1).join('.'); + var suffix = "_stats"; + return (root == "")?inputs.metrics_summary_report.basename+suffix:root+suffix; + }; + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/scstats:v0.0.1 + + +inputs: + + metrics_summary_report: + type: File + inputBinding: + position: 6 + prefix: "--metrics" + + output_prefix: + type: string? 
+ inputBinding: + position: 7 + prefix: "--output" + valueFrom: $(get_output_prefix()) + default: "" + + +outputs: + + collected_statistics_yaml: + type: File + outputBinding: + glob: $(get_output_prefix()+".yaml") + + collected_statistics_tsv: + type: File + outputBinding: + glob: $(get_output_prefix()+".tsv") + + collected_statistics_md: + type: File + outputBinding: + glob: $(get_output_prefix()+".md") + + +baseCommand: ["cell_ranger_arc_count_stats.py"] + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +s:name: "Cell Ranger ARC Count Statistics" +label: "Cell Ranger ARC Count Statistics" +s:alternateName: "Collects statistics from Cell Ranger ARC Count experiment" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/collect-stats-sc-arc-count.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Cell Ranger ARC Count Statistics + ================================ + + Collects statistics from Cell Ranger 
ARC Count experiment + + +s:about: | + Collects statistics from Cell Ranger ARC Count experiment diff --git a/tools/collect-stats-sc-count.cwl b/tools/collect-stats-sc-count.cwl new file mode 100644 index 00000000..ac85994e --- /dev/null +++ b/tools/collect-stats-sc-count.cwl @@ -0,0 +1,116 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: ShellCommandRequirement +- class: InlineJavascriptRequirement + expressionLib: + - var get_output_prefix = function() { + if (inputs.output_prefix) { + return inputs.output_prefix; + } + var root = inputs.metrics_summary_report.basename.split('.').slice(0,-1).join('.'); + var suffix = "_stats"; + return (root == "")?inputs.metrics_summary_report.basename+suffix:root+suffix; + }; + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/scstats:v0.0.1 + + +inputs: + + metrics_summary_report: + type: File + inputBinding: + position: 6 + prefix: "--metrics" + + output_prefix: + type: string? + inputBinding: + position: 7 + prefix: "--output" + valueFrom: $(get_output_prefix()) + default: "" + + +outputs: + + collected_statistics_yaml: + type: File + outputBinding: + glob: $(get_output_prefix()+".yaml") + + collected_statistics_tsv: + type: File + outputBinding: + glob: $(get_output_prefix()+".tsv") + + collected_statistics_md: + type: File + outputBinding: + glob: $(get_output_prefix()+".md") + + +baseCommand: ["cell_ranger_count_stats.py"] + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +s:name: "Cell Ranger Count Statistics" +label: "Cell Ranger Count Statistics" +s:alternateName: "Collects statistics from Cell Ranger Count experiment" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/collect-stats-sc-count.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: 
s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Cell Ranger Count Statistics + ================================ + + Collects statistics from Cell Ranger Count experiment + + +s:about: | + Collects statistics from Cell Ranger Count experiment diff --git a/tools/extract-7z.cwl b/tools/extract-7z.cwl new file mode 100644 index 00000000..7bf2ed84 --- /dev/null +++ b/tools/extract-7z.cwl @@ -0,0 +1,127 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: ShellCommandRequirement +- class: InlineJavascriptRequirement +- class: DockerRequirement + dockerPull: biowardrobe2/scidap:v0.0.3 + + +inputs: + + script: + type: string? 
+ default: | + #!/bin/bash + COMBINED="$0" + function extract { + FILE=$1 + COMBINED=$2 + T=`file -b "${FILE}" | awk '{print $1}'` + case "${T}" in + "bzip2"|"gzip"|"Zip") + 7z e -so "${FILE}" >> "${COMBINED}" + ;; + "ASCII") + cat "${FILE}" >> "${COMBINED}" || true + ;; + *) + echo "Error: file type unknown" + rm -f "${COMBINED}" + exit 1 + esac + } + for FILE in "$@"; do + echo "Extracting:" $FILE; + extract "${FILE}" "${COMBINED}" + done; + inputBinding: + position: 5 + doc: | + Bash script to extract compressed file(s) + + output_filename: + type: string + inputBinding: + position: 6 + doc: | + Output filename for extracted and optionally + merged file(s) + + file_to_extract: + type: + - File + - type: array + items: File + inputBinding: + position: 7 + doc: | + Compressed file(s) to extract + + +outputs: + + extracted_file: + type: File + outputBinding: + glob: $(inputs.output_filename) + + +baseCommand: ["bash", "-c"] + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +s:name: "extract-fastq" +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/extract-7z.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + 
s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + +doc: | + Tool to decompress input file(s). + If several files are provided, they will be concatenated in + the order that corresponds to files in input. + Bash script's logic: + - check file type, decompress if needed, otherwise just cat + the content of the file + - return 1, if file type is not recognized + + +s:about: | + Tool to decompress input file(s) diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl new file mode 100644 index 00000000..f0994936 --- /dev/null +++ b/tools/sc-atac-cluster.cwl @@ -0,0 +1,644 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: EnvVarRequirement + envDef: + R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-tools:v0.0.15 + + +inputs: + + query_data_rds: + type: File + inputBinding: + prefix: "--query" + doc: | + Path to the RDS file to load Seurat object from. This file should include + chromatin accessibility information stored in the ATAC assay, as well as + 'atac_lsi' and 'atacumap' dimensionality reductions applied to that assay. + + dimensions: + type: + - "null" + - int + - int[] + inputBinding: + prefix: "--dimensions" + doc: | + Dimensionality to use when constructing nearest-neighbor graph before clustering + (from 1 to 50). If single value N is provided, use from 2 to N dimensions. If + multiple values are provided, subset to only selected dimensions. + Default: from 2 to 10 + + cluster_metric: + type: + - "null" + - type: enum + symbols: + - "euclidean" + - "cosine" + - "manhattan" + - "hamming" + inputBinding: + prefix: "--ametric" + doc: | + Distance metric used when constructing nearest-neighbor graph before clustering. 
+ Default: euclidean + + cluster_algorithm: + type: + - "null" + - type: enum + symbols: + - "louvain" + - "mult-louvain" + - "slm" + - "leiden" + inputBinding: + prefix: "--algorithm" + doc: | + Algorithm for modularity optimization when running clustering. + Default: slm + + resolution: + type: + - "null" + - float + - float[] + inputBinding: + prefix: "--resolution" + doc: | + Clustering resolution applied to the constructed nearest-neighbor graph. + Can be set as an array but only the first item from the list will be used + for cluster labels and peak markers in the UCSC Cell Browser when running + with --cbbuild and --diffpeaks parameters. + Default: 0.3, 0.5, 1.0 + + atac_fragments_file: + type: File? + secondaryFiles: + - .tbi + inputBinding: + prefix: "--fragments" + doc: | + Count and barcode information for every ATAC fragment used in the loaded Seurat + object. File should be saved in TSV format with tbi-index file. + + genes_of_interest: + type: + - "null" + - string + - string[] + inputBinding: + prefix: "--genes" + doc: | + Genes of interest to build Tn5 insertion frequency plots for the nearest peaks. + If loaded Seurat object includes genes expression information in the RNA assay + it will be additionally shown on the right side of the plots. + Ignored if '--fragments' is not provided. + Default: None + + identify_diff_peaks: + type: boolean? + inputBinding: + prefix: "--diffpeaks" + doc: | + Identify differentially accessible peaks between each pair of clusters for all resolutions. + Default: false + + minimum_logfc: + type: float? + inputBinding: + prefix: "--logfc" + doc: | + For differentially accessible peaks identification include only those peaks that + on average have log fold change difference in the chromatin accessibility between + every tested pair of clusters not lower than this value. Ignored if '--diffpeaks' + is not set. + Default: 0.25 + + minimum_pct: + type: float? 
+ inputBinding: + prefix: "--minpct" + doc: | + For differentially accessible peaks identification include only those peaks that + are detected in not lower than this fraction of cells in either of the two tested + clusters. Ignored if '--diffpeaks' is not set. + Default: 0.05 + + test_to_use: + type: + - "null" + - type: enum + symbols: + - "wilcox" + - "bimod" + - "roc" + - "t" + - "negbinom" + - "poisson" + - "LR" + - "MAST" + - "DESeq2" + inputBinding: + prefix: "--testuse" + doc: | + Statistical test to use for differentially accessible peaks identification. + Ignored if '--diffpeaks' is not set. + Default: LR + + export_pdf_plots: + type: boolean? + inputBinding: + prefix: "--pdf" + doc: | + Export plots in PDF. + Default: false + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + inputBinding: + prefix: "--theme" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + + verbose: + type: boolean? + inputBinding: + prefix: "--verbose" + doc: | + Print debug information. + Default: false + + export_h5seurat_data: + type: boolean? + inputBinding: + prefix: "--h5seurat" + doc: | + Save Seurat data to h5seurat file. + Default: false + + export_h5ad_data: + type: boolean? + inputBinding: + prefix: "--h5ad" + doc: | + Save Seurat data to h5ad file. + Default: false + + export_ucsc_cb: + type: boolean? + inputBinding: + prefix: "--cbbuild" + doc: | + Export results to UCSC Cell Browser. Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix. + Default: ./sc + + parallel_memory_limit: + type: int? + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Default: 32 + + vector_memory_limit: + type: int? 
+ default: 128 + doc: | + Maximum vector memory in GB allowed to be used by R. + Default: 128 + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. + Default: 1 + + +outputs: + + umap_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_res_*.png" + doc: | + Clustered cells UMAP. + PNG format + + umap_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_res_*.pdf" + doc: | + Clustered cells UMAP. + PDF format + + slh_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_slh_res_*.png" + doc: | + Silhouette scores. Downsampled to max 500 cells per cluster. + PNG format + + slh_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_slh_res_*.pdf" + doc: | + Silhouette scores. Downsampled to max 500 cells per cluster. + PDF format + + umap_spl_idnt_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_idnt_res_*.png" + doc: | + Split by dataset clustered cells UMAP. + PNG format + + umap_spl_idnt_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_idnt_res_*.pdf" + doc: | + Split by dataset clustered cells UMAP. + PDF format + + cmp_gr_clst_spl_idnt_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_clst_spl_idnt_res_*.png" + doc: | + Grouped by cluster split by dataset cells composition plot. Downsampled. + PNG format + + cmp_gr_clst_spl_idnt_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_clst_spl_idnt_res_*.pdf" + doc: | + Grouped by cluster split by dataset cells composition plot. Downsampled. 
+ PDF format + + cmp_gr_idnt_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_idnt_spl_clst_res_*.png" + doc: | + Grouped by dataset split by cluster cells composition plot. Downsampled. + PNG format + + cmp_gr_idnt_spl_clst_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_idnt_spl_clst_res_*.pdf" + doc: | + Grouped by dataset split by cluster cells composition plot. Downsampled. + PDF format + + umap_spl_cnd_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_cnd_res_*.png" + doc: | + Split by grouping condition clustered cells UMAP. + PNG format + + umap_spl_cnd_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_cnd_res_*.pdf" + doc: | + Split by grouping condition clustered cells UMAP. + PDF format + + cmp_gr_clst_spl_cnd_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_clst_spl_cnd_res_*.png" + doc: | + Grouped by cluster split by condition cells composition plot. Downsampled. + PNG format + + cmp_gr_clst_spl_cnd_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_clst_spl_cnd_res_*.pdf" + doc: | + Grouped by cluster split by condition cells composition plot. Downsampled. + PDF format + + cmp_gr_cnd_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_cnd_spl_clst_res_*.png" + doc: | + Grouped by condition split by cluster cells composition plot. Downsampled. + PNG format + + cmp_gr_cnd_spl_clst_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_cnd_spl_clst_res_*.pdf" + doc: | + Grouped by condition split by cluster cells composition plot. Downsampled. 
+ PDF format + + cvrg_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cvrg_res_*.png" + doc: | + Tn5 insertion frequency plot around gene. + PNG format + + cvrg_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cvrg_res_*.pdf" + doc: | + Tn5 insertion frequency plot around gene. + PDF format + + peak_markers_tsv: + type: File? + outputBinding: + glob: "*_peak_markers.tsv" + doc: | + Differentially accessible peaks between each pair of clusters for all resolutions. + TSV format + + ucsc_cb_config_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser" + doc: | + Directory with UCSC Cellbrowser configuration data. + + ucsc_cb_html_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser/html_data" + doc: | + Directory with UCSC Cellbrowser html data. + + ucsc_cb_html_file: + type: File? + outputBinding: + glob: "*_cellbrowser/html_data/index.html" + doc: | + HTML index file from the directory with UCSC Cellbrowser html data. + + seurat_data_rds: + type: File + outputBinding: + glob: "*_data.rds" + doc: | + Reduced Seurat data in RDS format + + seurat_data_h5seurat: + type: File? + outputBinding: + glob: "*_data.h5seurat" + doc: | + Reduced Seurat data in h5seurat format + + seurat_data_h5ad: + type: File? 
+ outputBinding: + glob: "*_data.h5ad" + doc: | + Reduced Seurat data in h5ad format + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["sc_atac_cluster.R"] + +stdout: sc_atac_cluster_stdout.log +stderr: sc_atac_cluster_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-cell ATAC-Seq Cluster Analysis" +s:name: "Single-cell ATAC-Seq Cluster Analysis" +s:alternateName: "Clusters single-cell ATAC-Seq datasets, identifies differentially accessible peaks" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-atac-cluster.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell ATAC-Seq Cluster Analysis + + Clusters single-cell ATAC-Seq datasets, identifies differentially + accessible peaks. 
+ + +s:about: | + usage: sc_atac_cluster.R + [-h] --query QUERY [--dimensions [DIMENSIONS ...]] + [--ametric {euclidean,cosine,manhattan,hamming}] + [--algorithm {louvain,mult-louvain,slm,leiden}] + [--resolution [RESOLUTION ...]] [--fragments FRAGMENTS] + [--genes [GENES ...]] [--diffpeaks] [--logfc LOGFC] [--minpct MINPCT] + [--testuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] + [--pdf] [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + + Single-cell ATAC-Seq Cluster Analysis + + options: + -h, --help show this help message and exit + --query QUERY Path to the RDS file to load Seurat object from. This + file should include chromatin accessibility + information stored in the ATAC assay, as well as + 'atac_lsi' and 'atacumap' dimensionality reductions + applied to that assay. + --dimensions [DIMENSIONS ...] + Dimensionality to use when constructing nearest- + neighbor graph before clustering (from 1 to 50). If + single value N is provided, use from 2 to N + dimensions. If multiple values are provided, subset to + only selected dimensions. Default: from 2 to 10 + --ametric {euclidean,cosine,manhattan,hamming} + Distance metric used when constructing nearest- + neighbor graph before clustering. Default: euclidean + --algorithm {louvain,mult-louvain,slm,leiden} + Algorithm for modularity optimization when running + clustering. Default: slm + --resolution [RESOLUTION ...] + Clustering resolution applied to the constructed + nearest-neighbor graph. Can be set as an array but + only the first item from the list will be used for + cluster labels and peak markers in the UCSC Cell + Browser when running with --cbbuild and --diffpeaks + parameters. Default: 0.3, 0.5, 1.0 + --fragments FRAGMENTS + Count and barcode information for every ATAC fragment + used in the loaded Seurat object. File should be saved + in TSV format with tbi-index file. 
+ --genes [GENES ...] Genes of interest to build Tn5 insertion frequency + plots for the nearest peaks. If loaded Seurat object + includes genes expression information in the RNA assay + it will be additionally shown on the right side of the + plots. Ignored if '--fragments' is not provided. + Default: None + --diffpeaks Identify differentially accessible peaks between each + pair of clusters for all resolutions. Default: false + --logfc LOGFC For differentially accessible peaks identification + include only those peaks that on average have log fold + change difference in the chromatin accessibility + between every tested pair of clusters not lower than + this value. Ignored if '--diffpeaks' is not set. + Default: 0.25 + --minpct MINPCT For differentially accessible peaks identification + include only those peaks that are detected in not + lower than this fraction of cells in either of the two + tested clusters. Ignored if '--diffpeaks' is not set. + Default: 0.05 + --testuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2} + Statistical test to use for differentially accessible + peaks identification. Ignored if '--diffpeaks' is not + set. Default: LR + --pdf Export plots in PDF. Default: false + --verbose Print debug information. Default: false + --h5seurat Save Seurat data to h5seurat file. Default: false + --h5ad Save Seurat data to h5ad file. Default: false + --cbbuild Export results to UCSC Cell Browser. Default: false + --output OUTPUT Output prefix. Default: ./sc + --theme {gray,bw,linedraw,light,dark,minimal,classic,void} + Color theme for all generated plots. Default: classic + --cpus CPUS Number of cores/cpus to use. Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple --cpus. 
Default: 32 \ No newline at end of file diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl new file mode 100644 index 00000000..51fd7ded --- /dev/null +++ b/tools/sc-atac-reduce.cwl @@ -0,0 +1,599 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: EnvVarRequirement + envDef: + R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-tools:v0.0.15 + + +inputs: + + query_data_rds: + type: File + inputBinding: + prefix: "--query" + doc: | + Path to the RDS file to load Seurat object from. This file should include + chromatin accessibility information stored in the ATAC assay. + + datasets_metadata: + type: File? + inputBinding: + prefix: "--metadata" + doc: | + Path to the TSV/CSV file to optionally extend Seurat + object metadata with categorical values using samples + identities. First column - 'library_id' should + correspond to all unique values from the 'new.ident' + column of the loaded Seurat object. If any of the + provided in this file columns are already present in + the Seurat object metadata, they will be overwritten. + When combined with --barcodes parameter, first the + metadata will be extended, then barcode filtering will + be applied. + Default: no extra metadata is added + + barcodes_data: + type: File? + inputBinding: + prefix: "--barcodes" + doc: | + Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata overwriting the existing ones if + those are present.
+ Default: all cells used, no extra metadata is added + + normalization_method: + type: + - "null" + - type: enum + symbols: + - "log-tfidf" + - "tf-logidf" + - "logtf-logidf" + - "idf" + inputBinding: + prefix: "--norm" + doc: | + TF-IDF normalization method applied to chromatin + accessibility counts. log-tfidf - Stuart & Butler et + al. 2019, tf-logidf - Cusanovich & Hill et al. 2018, + logtf-logidf - Andrew Hill, idf - 10x Genomics. + Default: log-tfidf + + integration_method: + type: + - "null" + - type: enum + symbols: + - "signac" + - "harmony" + - "none" + inputBinding: + prefix: "--ntgr" + doc: | + Integration method used for joint analysis of multiple + datasets. Automatically set to 'none' if loaded Seurat + object includes only one dataset. Default: signac + + integrate_by: + type: + - "null" + - string + - string[] + inputBinding: + prefix: "--ntgrby" + doc: | + Column(s) from the Seurat object metadata to define + the variable(s) that should be integrated out when + running multiple datasets integration with harmony. + May include columns from the extra metadata added with + --metadata parameter. Ignored if --ntgr is not set to + harmony. + Default: new.ident + + minimum_var_peaks_perc: + type: int? + inputBinding: + prefix: "--minvarpeaks" + doc: | + Minimum percentile for identifying the top most common peaks as highly variable. + For example, setting to 5 will use the top 95 percent most common among all cells + peaks as highly variable. These peaks are used for datasets integration, scaling + and dimensionality reduction. + Default: 0 (use all available peaks) + + dimensions: + type: + - "null" + - int + - int[] + inputBinding: + prefix: "--dimensions" + doc: | + Dimensionality to use for datasets integration and + UMAP projection (from 2 to 50). If single value N is + provided, use from 2 to N LSI components. If multiple + values are provided, subset to only selected LSI + components.
In combination with --ntgr set to harmony, + selected principal components will be used in Harmony + integration. + Default: from 2 to 10 + + umap_spread: + type: float? + inputBinding: + prefix: "--uspread" + doc: | + The effective scale of embedded points on UMAP. In combination with '--umindist' + it determines how clustered/clumped the embedded points are. + Default: 1 + + umap_mindist: + type: float? + inputBinding: + prefix: "--umindist" + doc: | + Controls how tightly the embedding is allowed to compress points together on UMAP. + Larger values ensure embedded points are more evenly distributed, while smaller + values allow the algorithm to optimise more accurately with regard to local structure. + Sensible values are in the range 0.001 to 0.5. + Default: 0.3 + + umap_neighbors: + type: int? + inputBinding: + prefix: "--uneighbors" + doc: | + Determines the number of neighboring points used in UMAP. Larger values will result + in more global structure being preserved at the loss of detailed local structure. + In general this parameter should often be in the range 5 to 50. + Default: 30 + + umap_metric: + type: + - "null" + - type: enum + symbols: + - "euclidean" + - "manhattan" + - "chebyshev" + - "minkowski" + - "canberra" + - "braycurtis" + - "mahalanobis" + - "wminkowski" + - "seuclidean" + - "cosine" + - "correlation" + - "haversine" + - "hamming" + - "jaccard" + - "dice" + - "russelrao" + - "kulsinski" + - "ll_dirichlet" + - "hellinger" + - "rogerstanimoto" + - "sokalmichener" + - "sokalsneath" + - "yule" + inputBinding: + prefix: "--umetric" + doc: | + The metric to use to compute distances in high dimensional space for UMAP. + Default: cosine + + umap_method: + type: + - "null" + - type: enum + symbols: + - "uwot" + - "uwot-learn" + - "umap-learn" + inputBinding: + prefix: "--umethod" + doc: | + UMAP implementation to run. If set to 'umap-learn', use --umetric 'correlation'. + Default: uwot + + export_pdf_plots: + type: boolean?
+ inputBinding: + prefix: "--pdf" + doc: | + Export plots in PDF. + Default: false + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + inputBinding: + prefix: "--theme" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + + verbose: + type: boolean? + inputBinding: + prefix: "--verbose" + doc: | + Print debug information. + Default: false + + export_h5seurat_data: + type: boolean? + inputBinding: + prefix: "--h5seurat" + doc: | + Save Seurat data to h5seurat file. + Default: false + + export_h5ad_data: + type: boolean? + inputBinding: + prefix: "--h5ad" + doc: | + Save Seurat data to h5ad file. + Default: false + + export_ucsc_cb: + type: boolean? + inputBinding: + prefix: "--cbbuild" + doc: | + Export results to UCSC Cell Browser. Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix. + Default: ./sc + + parallel_memory_limit: + type: int? + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Default: 32 + + vector_memory_limit: + type: int? + default: 128 + doc: | + Maximum vector memory in GB allowed to be used by R. + Default: 128 + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. + Default: 1 + + +outputs: + + qc_dim_corr_plot_png: + type: File? + outputBinding: + glob: "*_qc_dim_corr.png" + doc: | + Correlation plots between QC metrics and cells LSI dimensions. + PNG format + + qc_dim_corr_plot_pdf: + type: File? + outputBinding: + glob: "*_qc_dim_corr.pdf" + doc: | + Correlation plots between QC metrics and cells LSI dimensions. + PDF format + + umap_qc_mtrcs_plot_png: + type: File? + outputBinding: + glob: "*_umap_qc_mtrcs.png" + doc: | + QC metrics on cells UMAP. 
+ PNG format + + umap_qc_mtrcs_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_qc_mtrcs.pdf" + doc: | + QC metrics on cells UMAP. + PDF format + + umap_plot_png: + type: File? + outputBinding: + glob: "*_umap.png" + doc: | + Cells UMAP. + PNG format + + umap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap.pdf" + doc: | + Cells UMAP. + PDF format + + umap_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_idnt.png" + doc: | + Split by dataset cells UMAP. + PNG format + + umap_spl_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_idnt.pdf" + doc: | + Split by dataset cells UMAP. + PDF format + + umap_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_cnd.png" + doc: | + Split by grouping condition cells UMAP. + PNG format + + umap_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_cnd.pdf" + doc: | + Split by grouping condition cells UMAP. + PDF format + + ucsc_cb_config_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser" + doc: | + Directory with UCSC Cellbrowser configuration data. + + ucsc_cb_html_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser/html_data" + doc: | + Directory with UCSC Cellbrowser html data. + + ucsc_cb_html_file: + type: File? + outputBinding: + glob: "*_cellbrowser/html_data/index.html" + doc: | + HTML index file from the directory with UCSC Cellbrowser html data. + + seurat_data_rds: + type: File + outputBinding: + glob: "*_data.rds" + doc: | + Reduced Seurat data in RDS format + + seurat_data_h5seurat: + type: File? + outputBinding: + glob: "*_data.h5seurat" + doc: | + Reduced Seurat data in h5seurat format + + seurat_data_h5ad: + type: File? 
+ outputBinding: + glob: "*_data.h5ad" + doc: | + Reduced Seurat data in h5ad format + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["sc_atac_reduce.R"] + +stdout: sc_atac_reduce_stdout.log +stderr: sc_atac_reduce_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-cell ATAC-Seq Dimensionality Reduction Analysis" +s:name: "Single-cell ATAC-Seq Dimensionality Reduction Analysis" +s:alternateName: "Integrates multiple single-cell ATAC-Seq datasets, reduces dimensionality using LSI" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-atac-reduce.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell ATAC-Seq Dimensionality Reduction Analysis + + Integrates multiple single-cell ATAC-Seq datasets, reduces dimensionality using LSI. 
+ + +s:about: | + usage: sc_atac_reduce.R + [-h] --query QUERY [--metadata METADATA] [--barcodes BARCODES] + [--norm {log-tfidf,tf-logidf,logtf-logidf,idf}] + [--ntgr {signac,harmony,none}] [--ntgrby [NTGRBY ...]] + [--minvarpeaks MINVARPEAKS] [--dimensions [DIMENSIONS ...]] + [--uspread USPREAD] [--umindist UMINDIST] [--uneighbors UNEIGHBORS] + [--umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis, + mahalanobis,wminkowski,seuclidean,cosine,correlation,haversine, + hamming,jaccard,dice,russelrao,kulsinski,ll_dirichlet,hellinger, + rogerstanimoto,sokalmichener,sokalsneath,yule}] + [--umethod {uwot,uwot-learn,umap-learn}] [--pdf] [--verbose] + [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + + Single-cell ATAC-Seq Dimensionality Reduction Analysis + + options: + -h, --help show this help message and exit + --query QUERY Path to the RDS file to load Seurat object from. This + file should include chromatin accessibility + information stored in the ATAC assay. + --metadata METADATA Path to the TSV/CSV file to optionally extend Seurat + object metadata with categorical values using samples + identities. First column - 'library_id' should + correspond to all unique values from the 'new.ident' + column of the loaded Seurat object. If any of the + provided in this file columns are already present in + the Seurat object metadata, they will be overwritten. + When combined with --barcodes parameter, first the + metadata will be extended, then barcode filtering will + be applied. Default: no extra metadata is added + --barcodes BARCODES Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata overwriting the existing ones if + those are present.
Default: all cells used, no extra + metadata is added + --norm {log-tfidf,tf-logidf,logtf-logidf,idf} + TF-IDF normalization method applied to chromatin + accessibility counts. log-tfidf - Stuart & Butler et + al. 2019, tf-logidf - Cusanovich & Hill et al. 2018, + logtf-logidf - Andrew Hill, idf - 10x Genomics. + Default: log-tfidf + --ntgr {signac,harmony,none} + Integration method used for joint analysis of multiple + datasets. Automatically set to 'none' if loaded Seurat + object includes only one dataset. Default: signac + --ntgrby [NTGRBY ...] + Column(s) from the Seurat object metadata to define + the variable(s) that should be integrated out when + running multiple datasets integration with harmony. + May include columns from the extra metadata added with + --metadata parameter. Ignored if --ntgr is not set to + harmony. Default: new.ident + --minvarpeaks MINVARPEAKS + Minimum percentile for identifying the top most common + peaks as highly variable. For example, setting to 5 + will use the top 95 percent most common among all + cells peaks as highly variable. These peaks are used + for datasets integration, scaling and dimensionality + reduction. Default: 0 (use all available peaks) + --dimensions [DIMENSIONS ...] + Dimensionality to use for datasets integration and + UMAP projection (from 2 to 50). If single value N is + provided, use from 2 to N LSI components. If multiple + values are provided, subset to only selected LSI + components. In combination with --ntgr set to harmony, + selected principal components will be used in Harmony + integration. Default: from 2 to 10 + --uspread USPREAD The effective scale of embedded points on UMAP. In + combination with '--umindist' it determines how + clustered/clumped the embedded points are. Default: 1 + --umindist UMINDIST Controls how tightly the embedding is allowed to compress + points together on UMAP.
Larger values ensure embedded + points are more evenly distributed, while smaller + values allow the algorithm to optimise more accurately + with regard to local structure. Sensible values are in + the range 0.001 to 0.5. Default: 0.3 + --uneighbors UNEIGHBORS + Determines the number of neighboring points used in + UMAP. Larger values will result in more global + structure being preserved at the loss of detailed + local structure. In general this parameter should + often be in the range 5 to 50. Default: 30 + --umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis,mahalanobis, + wminkowski,seuclidean,cosine,correlation,haversine,hamming,jaccard,dice, + russelrao,kulsinski,ll_dirichlet,hellinger,rogerstanimoto,sokalmichener, + sokalsneath,yule} + The metric to use to compute distances in high + dimensional space for UMAP. Default: cosine + --umethod {uwot,uwot-learn,umap-learn} + UMAP implementation to run. If set to 'umap-learn', use + --umetric 'correlation'. Default: uwot + --pdf Export plots in PDF. Default: false + --verbose Print debug information. Default: false + --h5seurat Save Seurat data to h5seurat file. Default: false + --h5ad Save Seurat data to h5ad file. Default: false + --cbbuild Export results to UCSC Cell Browser. Default: false + --output OUTPUT Output prefix. Default: ./sc + --theme {gray,bw,linedraw,light,dark,minimal,classic,void} + Color theme for all generated plots. Default: classic + --cpus CPUS Number of cores/cpus to use. Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple --cpus.
Default: 32 \ No newline at end of file diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl new file mode 100644 index 00000000..36a4746a --- /dev/null +++ b/tools/sc-ctype-assign.cwl @@ -0,0 +1,1005 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: EnvVarRequirement + envDef: + R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-tools:v0.0.15 + + +inputs: + + query_data_rds: + type: File + inputBinding: + prefix: "--query" + doc: | + Path to the RDS file to load Seurat object from. This file should include + genes expression and/or chromatin accessibility information stored in the RNA + and ATAC assays correspondingly. Additionally, 'rnaumap', and/or 'atacumap', + and/or 'wnnumap' dimensionality reductions should be present. + + cell_type_data: + type: File + inputBinding: + prefix: "--celltypes" + doc: | + Path to the TSV/CSV file for manual cell type assignment for each of the clusters. + First column - 'cluster', second column may have arbitrary name. + + query_source_column: + type: string + inputBinding: + prefix: "--source" + doc: | + Column from the metadata of the loaded Seurat object to select clusters from. + + query_target_column: + type: string + inputBinding: + prefix: "--target" + doc: | + Column from the metadata of the loaded Seurat object to save manually + assigned cell types. Should start with 'custom_', otherwise, it won't + be shown in UCSC Cell Browser. + + identify_diff_genes: + type: boolean? + inputBinding: + prefix: "--diffgenes" + doc: | + Identify differentially expressed genes (putative gene markers) for + assigned cell types. Ignored if loaded Seurat object doesn't include + genes expression information stored in the RNA assay. + Default: false + + identify_diff_peaks: + type: boolean? 
+ inputBinding: + prefix: "--diffpeaks" + doc: | + Identify differentially accessible peaks for assigned cell types. Ignored + if loaded Seurat object doesn't include chromatin accessibility information + stored in the ATAC assay. + Default: false + + rna_minimum_logfc: + type: float? + inputBinding: + prefix: "--rnalogfc" + doc: | + For putative gene markers identification include only those genes that + on average have log fold change difference in expression between every + tested pair of cell types not lower than this value. Ignored if '--diffgenes' + is not set or RNA assay is not present. + Default: 0.25 + + rna_minimum_pct: + type: float? + inputBinding: + prefix: "--rnaminpct" + doc: | + For putative gene markers identification include only those genes that + are detected in not lower than this fraction of cells in either of the + two tested cell types. Ignored if '--diffgenes' is not set or RNA assay + is not present. + Default: 0.1 + + only_positive_diff_genes: + type: boolean? + inputBinding: + prefix: "--rnaonlypos" + doc: | + For putative gene markers identification return only positive markers. + Ignored if '--diffgenes' is not set or RNA assay is not present. + Default: false + + rna_test_to_use: + type: + - "null" + - type: enum + symbols: + - "wilcox" + - "bimod" + - "roc" + - "t" + - "negbinom" + - "poisson" + - "LR" + - "MAST" + - "DESeq2" + inputBinding: + prefix: "--rnatestuse" + doc: | + Statistical test to use for putative gene markers identification. + Ignored if '--diffgenes' is not set or RNA assay is not present. + Default: wilcox + + atac_minimum_logfc: + type: float? + inputBinding: + prefix: "--ataclogfc" + doc: | + For differentially accessible peaks identification include only those peaks that + on average have log fold change difference in the chromatin accessibility between + every tested pair of cell types not lower than this value. Ignored if '--diffpeaks' + is not set or ATAC assay is not present. 
+ Default: 0.25 + + atac_minimum_pct: + type: float? + inputBinding: + prefix: "--atacminpct" + doc: | + For differentially accessible peaks identification include only those peaks that + are detected in not lower than this fraction of cells in either of the two tested + cell types. Ignored if '--diffpeaks' is not set or ATAC assay is not present. + Default: 0.05 + + atac_test_to_use: + type: + - "null" + - type: enum + symbols: + - "wilcox" + - "bimod" + - "roc" + - "t" + - "negbinom" + - "poisson" + - "LR" + - "MAST" + - "DESeq2" + inputBinding: + prefix: "--atactestuse" + doc: | + Statistical test to use for differentially accessible peaks identification. + Ignored if '--diffpeaks' is not set or ATAC assay is not present. + Default: LR + + atac_fragments_file: + type: File? + secondaryFiles: + - .tbi + inputBinding: + prefix: "--fragments" + doc: | + Count and barcode information for every ATAC fragment used in the loaded Seurat + object. File should be saved in TSV format with tbi-index file. Ignored if the + loaded Seurat object doesn't include ATAC assay. + + genes_of_interest: + type: + - "null" + - string + - string[] + inputBinding: + prefix: "--genes" + doc: | + Genes of interest to build gene expression and/or Tn5 insertion frequency plots + for the nearest peaks. To build gene expression plots the loaded Seurat object + should include RNA assay. To build Tn5 insertion frequency plots for the nearest + peaks the loaded Seurat object should include ATAC assay as well as the --fragments + file should be provided. + Default: None + + export_pdf_plots: + type: boolean? + inputBinding: + prefix: "--pdf" + doc: | + Export plots in PDF. + Default: false + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + inputBinding: + prefix: "--theme" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. 
+ Default: classic + + verbose: + type: boolean? + inputBinding: + prefix: "--verbose" + doc: | + Print debug information. + Default: false + + export_h5seurat_data: + type: boolean? + inputBinding: + prefix: "--h5seurat" + doc: | + Save Seurat data to h5seurat file. + Default: false + + export_h5ad_data: + type: boolean? + inputBinding: + prefix: "--h5ad" + doc: | + Save Seurat data to h5ad file. + Default: false + + export_ucsc_cb: + type: boolean? + inputBinding: + prefix: "--cbbuild" + doc: | + Export results to UCSC Cell Browser. Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix. + Default: ./sc + + parallel_memory_limit: + type: int? + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Default: 32 + + vector_memory_limit: + type: int? + default: 128 + doc: | + Maximum vector memory in GB allowed to be used by R. + Default: 128 + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. + Default: 1 + + +outputs: + + umap_rd_rnaumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_rd_rnaumap.png" + doc: | + Cells UMAP with assigned cell types (rnaumap dim. reduction). + PNG format + + umap_rd_rnaumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_rd_rnaumap.pdf" + doc: | + Cells UMAP with assigned cell types (rnaumap dim. reduction). + PDF format + + umap_rd_atacumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_rd_atacumap.png" + doc: | + Cells UMAP with assigned cell types (atacumap dim. reduction). + PNG format + + umap_rd_atacumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_rd_atacumap.pdf" + doc: | + Cells UMAP with assigned cell types (atacumap dim. reduction). + PDF format + + umap_rd_wnnumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_rd_wnnumap.png" + doc: | + Cells UMAP with assigned cell types (wnnumap dim. 
reduction). + PNG format + + umap_rd_wnnumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_rd_wnnumap.pdf" + doc: | + Cells UMAP with assigned cell types (wnnumap dim. reduction). + PDF format + + umap_spl_idnt_rd_rnaumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_idnt_rd_rnaumap.png" + doc: | + Split by dataset cells UMAP with assigned cell types (rnaumap dim. reduction). + PNG format + + umap_spl_idnt_rd_rnaumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_idnt_rd_rnaumap.pdf" + doc: | + Split by dataset cells UMAP with assigned cell types (rnaumap dim. reduction). + PDF format + + umap_spl_idnt_rd_atacumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_idnt_rd_atacumap.png" + doc: | + Split by dataset cells UMAP with assigned cell types (atacumap dim. reduction). + PNG format + + umap_spl_idnt_rd_atacumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_idnt_rd_atacumap.pdf" + doc: | + Split by dataset cells UMAP with assigned cell types (atacumap dim. reduction). + PDF format + + umap_spl_idnt_rd_wnnumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_idnt_rd_wnnumap.png" + doc: | + Split by dataset cells UMAP with assigned cell types (wnnumap dim. reduction). + PNG format + + umap_spl_idnt_rd_wnnumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_idnt_rd_wnnumap.pdf" + doc: | + Split by dataset cells UMAP with assigned cell types (wnnumap dim. reduction). + PDF format + + umap_spl_cnd_rd_rnaumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_cnd_rd_rnaumap.png" + doc: | + Split by grouping condition cells UMAP with assigned cell types (rnaumap dim. reduction). + PNG format + + umap_spl_cnd_rd_rnaumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_cnd_rd_rnaumap.pdf" + doc: | + Split by grouping condition cells UMAP with assigned cell types (rnaumap dim. reduction). + PDF format + + umap_spl_cnd_rd_atacumap_plot_png: + type: File? 
+ outputBinding: + glob: "*_umap_spl_cnd_rd_atacumap.png" + doc: | + Split by grouping condition cells UMAP with assigned cell types (atacumap dim. reduction). + PNG format + + umap_spl_cnd_rd_atacumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_cnd_rd_atacumap.pdf" + doc: | + Split by grouping condition cells UMAP with assigned cell types (atacumap dim. reduction). + PDF format + + umap_spl_cnd_rd_wnnumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_cnd_rd_wnnumap.png" + doc: | + Split by grouping condition cells UMAP with assigned cell types (wnnumap dim. reduction). + PNG format + + umap_spl_cnd_rd_wnnumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_cnd_rd_wnnumap.pdf" + doc: | + Split by grouping condition cells UMAP with assigned cell types (wnnumap dim. reduction). + PDF format + + umap_spl_ph_rd_rnaumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_ph_rd_rnaumap.png" + doc: | + Split by cell cycle phase cells UMAP with assigned cell types (rnaumap dim. reduction). + PNG format + + umap_spl_ph_rd_rnaumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_ph_rd_rnaumap.pdf" + doc: | + Split by cell cycle phase cells UMAP with assigned cell types (rnaumap dim. reduction). + PDF format + + umap_spl_ph_rd_atacumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_ph_rd_atacumap.png" + doc: | + Split by cell cycle phase cells UMAP with assigned cell types (atacumap dim. reduction). + PNG format + + umap_spl_ph_rd_atacumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_ph_rd_atacumap.pdf" + doc: | + Split by cell cycle phase cells UMAP with assigned cell types (atacumap dim. reduction). + PDF format + + umap_spl_ph_rd_wnnumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_ph_rd_wnnumap.png" + doc: | + Split by cell cycle phase cells UMAP with assigned cell types (wnnumap dim. reduction). + PNG format + + umap_spl_ph_rd_wnnumap_plot_pdf: + type: File? 
+ outputBinding: + glob: "*_umap_spl_ph_rd_wnnumap.pdf" + doc: | + Split by cell cycle phase cells UMAP with assigned cell types (wnnumap dim. reduction). + PDF format + + cmp_gr_ctyp_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_cmp_gr_ctyp_spl_idnt.png" + doc: | + Grouped by cell type split by dataset cells composition plot. Downsampled. + PNG format + + cmp_gr_ctyp_spl_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_cmp_gr_ctyp_spl_idnt.pdf" + doc: | + Grouped by cell type split by dataset cells composition plot. Downsampled. + PDF format + + cmp_gr_idnt_spl_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_cmp_gr_idnt_spl_ctyp.png" + doc: | + Grouped by dataset split by cell type cells composition plot. Downsampled. + PNG format + + cmp_gr_idnt_spl_ctyp_plot_pdf: + type: File? + outputBinding: + glob: "*_cmp_gr_idnt_spl_ctyp.pdf" + doc: | + Grouped by dataset split by cell type cells composition plot. Downsampled. + PDF format + + cmp_gr_ph_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_idnt.png" + doc: | + Grouped by cell cycle phase split by dataset cells composition plot. Downsampled. + PNG format + + cmp_gr_ph_spl_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_idnt.pdf" + doc: | + Grouped by cell cycle phase split by dataset cells composition plot. Downsampled. + PDF format + + cmp_gr_ctyp_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_cmp_gr_ctyp_spl_cnd.png" + doc: | + Grouped by cell type split by condition cells composition plot. Downsampled. + PNG format + + cmp_gr_ctyp_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_cmp_gr_ctyp_spl_cnd.pdf" + doc: | + Grouped by cell type split by condition cells composition plot. Downsampled. + PDF format + + cmp_gr_cnd_spl_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_cmp_gr_cnd_spl_ctyp.png" + doc: | + Grouped by condition split by cell type cells composition plot. Downsampled. 
+ PNG format + + cmp_gr_cnd_spl_ctyp_plot_pdf: + type: File? + outputBinding: + glob: "*_cmp_gr_cnd_spl_ctyp.pdf" + doc: | + Grouped by condition split by cell type cells composition plot. Downsampled. + PDF format + + cmp_gr_ph_spl_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_ctyp.png" + doc: | + Grouped by cell cycle phase split by cell type cells composition plot. Downsampled. + PNG format + + cmp_gr_ph_spl_ctyp_plot_pdf: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_ctyp.pdf" + doc: | + Grouped by cell cycle phase split by cell type cells composition plot. Downsampled. + PDF format + + xpr_avg_plot_png: + type: File? + outputBinding: + glob: "*_xpr_avg.png" + doc: | + Log normalized scaled average gene expression per cell type. + PNG format + + xpr_avg_plot_pdf: + type: File? + outputBinding: + glob: "*_xpr_avg.pdf" + doc: | + Log normalized scaled average gene expression per cell type. + PDF format + + xpr_dnst_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_dnst_*.png" + doc: | + Log normalized gene expression density per cell type. + PNG format + + xpr_dnst_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_dnst_*.pdf" + doc: | + Log normalized gene expression density per cell type. + PDF format + + xpr_per_cell_rd_rnaumap_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_rd_rnaumap_*.png" + doc: | + Log normalized gene expression on cells UMAP with assigned cell types (rnaumap dim. reduction). + PNG format + + xpr_per_cell_rd_rnaumap_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_rd_rnaumap_*.pdf" + doc: | + Log normalized gene expression on cells UMAP with assigned cell types (rnaumap dim. reduction). 
+ PDF format + + xpr_per_cell_rd_atacumap_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_rd_atacumap_*.png" + doc: | + Log normalized gene expression on cells UMAP with assigned cell types (atacumap dim. reduction). + PNG format + + xpr_per_cell_rd_atacumap_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_rd_atacumap_*.pdf" + doc: | + Log normalized gene expression on cells UMAP with assigned cell types (atacumap dim. reduction). + PDF format + + xpr_per_cell_rd_wnnumap_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_rd_wnnumap_*.png" + doc: | + Log normalized gene expression on cells UMAP with assigned cell types (wnnumap dim. reduction). + PNG format + + xpr_per_cell_rd_wnnumap_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_rd_wnnumap_*.pdf" + doc: | + Log normalized gene expression on cells UMAP with assigned cell types (wnnumap dim. reduction). + PDF format + + xpr_per_cell_sgnl_rd_rnaumap_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_sgnl_rd_rnaumap_*.png" + doc: | + Log normalized gene expression density on cells UMAP with assigned cell types (rnaumap dim. reduction). + PNG format + + xpr_per_cell_sgnl_rd_rnaumap_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_sgnl_rd_rnaumap_*.pdf" + doc: | + Log normalized gene expression density on cells UMAP with assigned cell types (rnaumap dim. reduction). + PDF format + + xpr_per_cell_sgnl_rd_atacumap_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_sgnl_rd_atacumap_*.png" + doc: | + Log normalized gene expression density on cells UMAP with assigned cell types (atacumap dim. reduction). 
+ PNG format + + xpr_per_cell_sgnl_rd_atacumap_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_sgnl_rd_atacumap_*.pdf" + doc: | + Log normalized gene expression density on cells UMAP with assigned cell types (atacumap dim. reduction). + PDF format + + xpr_per_cell_sgnl_rd_wnnumap_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_sgnl_rd_wnnumap_*.png" + doc: | + Log normalized gene expression density on cells UMAP with assigned cell types (wnnumap dim. reduction). + PNG format + + xpr_per_cell_sgnl_rd_wnnumap_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_sgnl_rd_wnnumap_*.pdf" + doc: | + Log normalized gene expression density on cells UMAP with assigned cell types (wnnumap dim. reduction). + PDF format + + cvrg_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cvrg_*.png" + doc: | + Tn5 insertion frequency plot around gene. + PNG format + + cvrg_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cvrg_*.pdf" + doc: | + Tn5 insertion frequency plot around gene. + PDF format + + xpr_htmp_plot_png: + type: File? + outputBinding: + glob: "*_xpr_htmp.png" + doc: | + Normalized gene expression heatmap grouped by cell type. + PNG format + + xpr_htmp_plot_pdf: + type: File? + outputBinding: + glob: "*_xpr_htmp.pdf" + doc: | + Normalized gene expression heatmap grouped by cell type. + PDF format + + gene_markers_tsv: + type: File? + outputBinding: + glob: "*_gene_markers.tsv" + doc: | + Differentially expressed genes between each pair of cell types. + TSV format + + peak_markers_tsv: + type: File? + outputBinding: + glob: "*_peak_markers.tsv" + doc: | + Differentially accessible peaks between each pair of cell types. + TSV format + + ucsc_cb_config_data: + type: Directory? 
+ outputBinding: + glob: "*_cellbrowser" + doc: | + Directory with UCSC Cellbrowser configuration data. + + ucsc_cb_html_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser/html_data" + doc: | + Directory with UCSC Cellbrowser html data. + + ucsc_cb_html_file: + type: File? + outputBinding: + glob: "*_cellbrowser/html_data/index.html" + doc: | + HTML index file from the directory with UCSC Cellbrowser html data. + + seurat_data_rds: + type: File + outputBinding: + glob: "*_data.rds" + doc: | + Reduced Seurat data in RDS format + + seurat_data_h5seurat: + type: File? + outputBinding: + glob: "*_data.h5seurat" + doc: | + Reduced Seurat data in h5seurat format + + seurat_data_h5ad: + type: File? + outputBinding: + glob: "*_data.h5ad" + doc: | + Reduced Seurat data in h5ad format + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["sc_ctype_assign.R"] + +stdout: sc_ctype_assign_stdout.log +stderr: sc_ctype_assign_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-cell Manual Cell Type Assignment" +s:name: "Single-cell Manual Cell Type Assignment" +s:alternateName: "Assigns cell types for clusters based on the provided metadata file" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-ctype-assign.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: 
"https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell Manual Cell Type Assignment + + Assigns cell types for clusters based on the provided metadata file. + + +s:about: | + usage: sc_ctype_assign.R + [-h] --query QUERY --celltypes CELLTYPES --source SOURCE --target + TARGET [--diffgenes] [--diffpeaks] [--rnalogfc RNALOGFC] + [--rnaminpct RNAMINPCT] [--rnaonlypos] + [--rnatestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] + [--ataclogfc ATACLOGFC] [--atacminpct ATACMINPCT] + [--atactestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] + [--fragments FRAGMENTS] [--genes [GENES ...]] [--pdf] [--verbose] + [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + + Single-cell Manual Cell Type Assignment + + options: + -h, --help show this help message and exit + --query QUERY Path to the RDS file to load Seurat object from. This + file should include genes expression and/or chromatin + accessibility information stored in the RNA and ATAC + assays correspondingly. Additionally, 'rnaumap', + and/or 'atacumap', and/or 'wnnumap' dimensionality + reductions should be present. + --celltypes CELLTYPES + Path to the TSV/CSV file for manual cell type + assignment for each of the clusters. First column - + 'cluster', second column may have arbitrary name. + --source SOURCE Column from the metadata of the loaded Seurat object + to select clusters from. + --target TARGET Column from the metadata of the loaded Seurat object + to save manually assigned cell types. 
Should start + with 'custom_', otherwise, it won't be shown in UCSC + Cell Browser. + --diffgenes Identify differentially expressed genes (putative gene + markers) for assigned cell types. Ignored if loaded + Seurat object doesn't include genes expression + information stored in the RNA assay. Default: false + --diffpeaks Identify differentially accessible peaks for assigned + cell types. Ignored if loaded Seurat object doesn't + include chromatin accessibility information stored in + the ATAC assay. Default: false + --rnalogfc RNALOGFC For putative gene markers identification include only + those genes that on average have log fold change + difference in expression between every tested pair of + cell types not lower than this value. Ignored if '-- + diffgenes' is not set or RNA assay is not present. + Default: 0.25 + --rnaminpct RNAMINPCT + For putative gene markers identification include only + those genes that are detected in not lower than this + fraction of cells in either of the two tested cell + types. Ignored if '--diffgenes' is not set or RNA + assay is not present. Default: 0.1 + --rnaonlypos For putative gene markers identification return only + positive markers. Ignored if '--diffgenes' is not set + or RNA assay is not present. Default: false + --rnatestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2} + Statistical test to use for putative gene markers + identification. Ignored if '--diffgenes' is not set or + RNA assay is not present. Default: wilcox + --ataclogfc ATACLOGFC + For differentially accessible peaks identification + include only those peaks that on average have log fold + change difference in the chromatin accessibility + between every tested pair of cell types not lower than + this value. Ignored if '--diffpeaks' is not set or + ATAC assay is not present. 
Default: 0.25 + --atacminpct ATACMINPCT + For differentially accessible peaks identification + include only those peaks that are detected in not + lower than this fraction of cells in either of the two + tested cell types. Ignored if '--diffpeaks' is not set + or ATAC assay is not present. Default: 0.05 + --atactestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2} + Statistical test to use for differentially accessible + peaks identification. Ignored if '--diffpeaks' is not + set or ATAC assay is not present. Default: LR + --fragments FRAGMENTS + Count and barcode information for every ATAC fragment + used in the loaded Seurat object. File should be saved + in TSV format with tbi-index file. Ignored if the + loaded Seurat object doesn't include ATAC assay. + --genes [GENES ...] Genes of interest to build gene expression and/or Tn5 + insertion frequency plots for the nearest peaks. To + build gene expression plots the loaded Seurat object + should include RNA assay. To build Tn5 insertion + frequency plots for the nearest peaks the loaded + Seurat object should include ATAC assay as well as the + --fragments file should be provided. Default: None + --pdf Export plots in PDF. Default: false + --verbose Print debug information. Default: false + --h5seurat Save Seurat data to h5seurat file. Default: false + --h5ad Save Seurat data to h5ad file. Default: false + --cbbuild Export results to UCSC Cell Browser. Default: false + --output OUTPUT Output prefix. Default: ./sc + --theme {gray,bw,linedraw,light,dark,minimal,classic,void} + Color theme for all generated plots. Default: classic + --cpus CPUS Number of cores/cpus to use. Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple --cpus. 
Default: 32 \ No newline at end of file diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl new file mode 100644 index 00000000..6c89c170 --- /dev/null +++ b/tools/sc-multiome-filter.cwl @@ -0,0 +1,1735 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: InitialWorkDirRequirement + listing: + - entryname: dummy_metadata.csv + entry: | + library_id + Experiment +- class: EnvVarRequirement + envDef: + R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-tools:v0.0.15 + + +inputs: + + feature_bc_matrices_folder: + type: Directory + inputBinding: + prefix: "--mex" + doc: | + Path to the folder with feature-barcode matrix from Cell Ranger ARC Count/Aggregate + experiment in MEX format. The rows consist of all the genes and peaks concatenated + together and the columns are restricted to those barcodes that are identified as cells. + + aggregation_metadata: + type: File? + doc: | + Path to the metadata TSV/CSV file to set the datasets identities. If '--mex' points to + the Cell Ranger ARC Aggregate outputs, the aggr.csv file can be used. If input is not + provided, the default dummy_metadata.csv will be used instead. + + atac_fragments_file: + type: File + secondaryFiles: + - .tbi + inputBinding: + prefix: "--fragments" + doc: | + Count and barcode information for every ATAC fragment observed in the experiment in TSV + format. Tbi-index file is required. + + annotation_gtf_file: + type: File + inputBinding: + prefix: "--annotations" + doc: | + Path to the genome annotation file in GTF format. + + grouping_data: + type: File? + inputBinding: + prefix: "--grouping" + doc: | + Path to the TSV/CSV file to define datasets grouping. + First column - 'library_id' with the values and order + that correspond to the 'library_id' column from the + '--identity' file, second column 'condition'.
+ Default: each dataset is assigned to its own group. + + blacklist_regions_file: + type: File? + inputBinding: + prefix: "--blacklist" + doc: | + Path to the optional BED file with the genomic blacklist regions. + + barcodes_data: + type: File? + inputBinding: + prefix: "--barcodes" + doc: | + Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata overwriting the existing ones if + those are present. + Default: all cells used, no extra metadata is added + + rna_minimum_cells: + type: int? + inputBinding: + prefix: "--rnamincells" + doc: | + Include only genes detected in at least this many cells. + Default: 5 (applied to all datasets) + + minimum_genes: + type: + - "null" + - int + - int[] + inputBinding: + prefix: "--mingenes" + doc: | + Include cells where at least this many genes are detected. If multiple values + provided, each of them will be applied to the correspondent dataset from the + '--mex' input based on the '--identity' file. + Default: 250 (applied to all datasets) + + maximum_genes: + type: + - "null" + - int + - int[] + inputBinding: + prefix: "--maxgenes" + doc: | + Include cells with the number of genes not bigger than this value. If multiple + values provided, each of them will be applied to the correspondent dataset from + the '--mex' input based on the '--identity' file. + Default: 5000 (applied to all datasets) + + rna_minimum_umi: + type: + - "null" + - int + - int[] + inputBinding: + prefix: "--rnaminumi" + doc: | + Include cells where at least this many UMI (RNA transcripts) are detected. + If multiple values provided, each of them will be applied to the correspondent + dataset from the '--mex' input based on the '--identity' file. + Default: 500 (applied to all datasets) + + mito_pattern: + type: string?
+ inputBinding: + prefix: "--mitopattern" + doc: | + Regex pattern to identify mitochondrial genes. + Default: '^mt-|^MT-' + + maximum_mito_perc: + type: float? + inputBinding: + prefix: "--maxmt" + doc: | + Include cells with the percentage of transcripts mapped to mitochondrial + genes not bigger than this value. + Default: 5 (applied to all datasets) + + minimum_novelty_score: + type: + - "null" + - float + - float[] + inputBinding: + prefix: "--minnovelty" + doc: | + Include cells with the novelty score not lower than this value, calculated + as log10(genes)/log10(UMI) for RNA assay. If multiple values provided, each of them will + be applied to the correspondent dataset from the '--mex' input based on the + '--identity' file. + Default: 0.8 (applied to all datasets) + + atac_minimum_cells: + type: int? + inputBinding: + prefix: "--atacmincells" + doc: | + Include only peaks detected in at least this many cells. + Default: 5 (applied to all datasets) + + atac_minimum_umi: + type: + - "null" + - int + - int[] + inputBinding: + prefix: "--atacminumi" + doc: | + Include cells where at least this many UMI (ATAC transcripts) are detected. + If multiple values provided, each of them will be applied to the correspondent + dataset from the '--mex' input based on the '--identity' file. + Default: 1000 (applied to all datasets) + + maximum_nucl_signal: + type: + - "null" + - float + - float[] + inputBinding: + prefix: "--maxnuclsignal" + doc: | + Include cells with the nucleosome signal not bigger than this value. + Nucleosome signal quantifies the approximate ratio of mononucleosomal + to nucleosome-free fragments. If multiple values provided, each of + them will be applied to the correspondent dataset from the '--mex' input + based on the '--identity' file.
+ Default: 4 (applied to all datasets) + + minimum_tss_enrich: + type: + - "null" + - float + - float[] + inputBinding: + prefix: "--mintssenrich" + doc: | + Include cells with the TSS enrichment score not lower than this value. + Score is calculated based on the ratio of fragments centered at the TSS + to fragments in TSS-flanking regions. If multiple values provided, each + of them will be applied to the correspondent dataset from the '--mex' input + based on the '--identity' file. + Default: 2 (applied to all datasets) + + minimum_frip: + type: + - "null" + - float + - float[] + inputBinding: + prefix: "--minfrip" + doc: | + Include cells with the FRiP not lower than this value. If multiple values + provided, each of them will be applied to the correspondent dataset from the + '--mex' input based on the '--identity' file. FRiP is calculated for fragments. + Default: 0.15 (applied to all datasets) + + maximum_blacklist_fraction: + type: + - "null" + - float + - float[] + inputBinding: + prefix: "--maxblacklist" + doc: | + Include cells with the fraction of fragments in + genomic blacklist regions not bigger than this value. + If multiple values provided, each of them will be + applied to the correspondent dataset from the '--mex' + input based on the '--identity' file. + Default: 0.05 (applied to all datasets) + + call_by: + type: string? + inputBinding: + prefix: "--callby" + doc: | + Replace Cell Ranger ARC peaks with MACS2 peaks called + for cells grouped by the column from the optionally + provided --barcodes file. If --barcodes file was not + provided MACS2 peaks can be still called per dataset + by setting --callby to new.ident. Peaks are called + only after applying all RNA related thresholds, + maximum nucleosome signal, and minimum TSS enrichment + scores filters. + Default: do not call peaks + + export_pdf_plots: + type: boolean? + inputBinding: + prefix: "--pdf" + doc: | + Export plots in PDF. 
+ Default: false + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + inputBinding: + prefix: "--theme" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + + verbose: + type: boolean? + inputBinding: + prefix: "--verbose" + doc: | + Print debug information. + Default: false + + export_h5seurat_data: + type: boolean? + inputBinding: + prefix: "--h5seurat" + doc: | + Save Seurat data to h5seurat file. + Default: false + + export_h5ad_data: + type: boolean? + inputBinding: + prefix: "--h5ad" + doc: | + Save Seurat data to h5ad file. + Default: false + + export_ucsc_cb: + type: boolean? + inputBinding: + prefix: "--cbbuild" + doc: | + Export results to UCSC Cell Browser. Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix. + Default: ./sc + + parallel_memory_limit: + type: int? + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Default: 32 + + vector_memory_limit: + type: int? + default: 128 + doc: | + Maximum vector memory in GB allowed to be used by R. + Default: 128 + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. + Default: 1 + + +outputs: + + raw_1_2_qc_mtrcs_pca_plot_png: + type: File? + outputBinding: + glob: "*_raw_1_2_qc_mtrcs_pca.png" + doc: | + PC1 and PC2 from the QC metrics PCA (not filtered). + PNG format + + raw_1_2_qc_mtrcs_pca_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_1_2_qc_mtrcs_pca.pdf" + doc: | + PC1 and PC2 from the QC metrics PCA (not filtered). + PDF format + + raw_2_3_qc_mtrcs_pca_plot_png: + type: File? + outputBinding: + glob: "*_raw_2_3_qc_mtrcs_pca.png" + doc: | + PC2 and PC3 from the QC metrics PCA (not filtered). 
+ PNG format + + raw_2_3_qc_mtrcs_pca_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_2_3_qc_mtrcs_pca.pdf" + doc: | + PC2 and PC3 from the QC metrics PCA (not filtered). + PDF format + + raw_cells_count_plot_png: + type: File? + outputBinding: + glob: "*_raw_cells_count.png" + doc: | + Number of cells per dataset (not filtered). + PNG format + + raw_cells_count_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_cells_count.pdf" + doc: | + Number of cells per dataset (not filtered). + PDF format + + raw_rna_umi_dnst_plot_png: + type: File? + outputBinding: + glob: "*_raw_rna_umi_dnst.png" + doc: | + UMI per cell density for RNA assay (not filtered). + PNG format + + raw_rna_umi_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_rna_umi_dnst.pdf" + doc: | + UMI per cell density for RNA assay (not filtered). + PDF format + + raw_gene_dnst_plot_png: + type: File? + outputBinding: + glob: "*_raw_gene_dnst.png" + doc: | + Genes per cell density (not filtered). + PNG format + + raw_gene_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_gene_dnst.pdf" + doc: | + Genes per cell density (not filtered). + PDF format + + raw_gene_umi_corr_plot_png: + type: File? + outputBinding: + glob: "*_raw_gene_umi_corr.png" + doc: | + Genes vs UMI per cell correlation for RNA assay (not filtered). + PNG format + + raw_gene_umi_corr_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_gene_umi_corr.pdf" + doc: | + Genes vs UMI per cell correlation for RNA assay (not filtered). + PDF format + + raw_mito_dnst_plot_png: + type: File? + outputBinding: + glob: "*_raw_mito_dnst.png" + doc: | + Percentage of transcripts mapped to mitochondrial genes per cell density (not filtered). + PNG format + + raw_mito_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_mito_dnst.pdf" + doc: | + Percentage of transcripts mapped to mitochondrial genes per cell density (not filtered). + PDF format + + raw_nvlt_dnst_plot_png: + type: File? 
+ outputBinding: + glob: "*_raw_nvlt_dnst.png" + doc: | + Novelty score per cell density for RNA assay (not filtered). + PNG format + + raw_nvlt_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_nvlt_dnst.pdf" + doc: | + Novelty score per cell density for RNA assay (not filtered). + PDF format + + raw_atac_umi_dnst_plot_png: + type: File? + outputBinding: + glob: "*_raw_atac_umi_dnst.png" + doc: | + UMI per cell density for ATAC assay (not filtered). + PNG format + + raw_atac_umi_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_atac_umi_dnst.pdf" + doc: | + UMI per cell density for ATAC assay (not filtered). + PDF format + + raw_peak_dnst_plot_png: + type: File? + outputBinding: + glob: "*_raw_peak_dnst.png" + doc: | + Peaks per cell density (not filtered). + PNG format + + raw_peak_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_peak_dnst.pdf" + doc: | + Peaks per cell density (not filtered). + PDF format + + raw_blck_dnst_plot_png: + type: File? + outputBinding: + glob: "*_raw_blck_dnst.png" + doc: | + Fraction of ATAC fragments within genomic blacklist regions per cell density (not filtered). + PNG format + + raw_blck_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_blck_dnst.pdf" + doc: | + Fraction of ATAC fragments within genomic blacklist regions per cell density (not filtered). + PDF format + + raw_rna_atac_umi_corr_plot_png: + type: File? + outputBinding: + glob: "*_raw_rna_atac_umi_corr.png" + doc: | + UMI per cell correlation for RNA vs ATAC assays (not filtered). + PNG format + + raw_rna_atac_umi_corr_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_rna_atac_umi_corr.pdf" + doc: | + UMI per cell correlation for RNA vs ATAC assays (not filtered). + PDF format + + raw_tss_atac_umi_corr_plot_png: + type: File? + outputBinding: + glob: "*_raw_tss_atac_umi_corr.png" + doc: | + TSS enrichment score vs UMI per cell correlation for ATAC assay (not filtered). 
+ PNG format + + raw_tss_atac_umi_corr_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_tss_atac_umi_corr.pdf" + doc: | + TSS enrichment score vs UMI per cell correlation for ATAC assay (not filtered). + PDF format + + raw_qc_mtrcs_dnst_plot_png: + type: File? + outputBinding: + glob: "*_raw_qc_mtrcs_dnst.png" + doc: | + QC metrics per cell density (not filtered). + PNG format + + raw_qc_mtrcs_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_qc_mtrcs_dnst.pdf" + doc: | + QC metrics per cell density (not filtered). + PDF format + + raw_tss_nrch_plot_png: + type: File? + outputBinding: + glob: "*_raw_tss_nrch.png" + doc: | + TSS enrichment score (not filtered). + PNG format + + raw_tss_nrch_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_tss_nrch.pdf" + doc: | + TSS enrichment score (not filtered). + PDF format + + raw_frgm_hist_png: + type: File? + outputBinding: + glob: "*_raw_frgm_hist.png" + doc: | + Fragments length histogram (not filtered). + PNG format + + raw_frgm_hist_pdf: + type: File? + outputBinding: + glob: "*_raw_frgm_hist.pdf" + doc: | + Fragments length histogram (not filtered). + PDF format + + raw_rna_umi_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_raw_rna_umi_dnst_spl_cnd.png" + doc: | + Split by grouping condition UMI per cell density for RNA assay (not filtered). + PNG format + + raw_rna_umi_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_rna_umi_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition UMI per cell density for RNA assay (not filtered). + PDF format + + raw_gene_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_raw_gene_dnst_spl_cnd.png" + doc: | + Split by grouping condition genes per cell density (not filtered). + PNG format + + raw_gene_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_gene_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition genes per cell density (not filtered). 
+ PDF format + + raw_mito_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_raw_mito_dnst_spl_cnd.png" + doc: | + Split by grouping condition the percentage of transcripts mapped + to mitochondrial genes per cell density (not filtered). + PNG format + + raw_mito_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_mito_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition the percentage of transcripts mapped + to mitochondrial genes per cell density (not filtered). + PDF format + + raw_nvlt_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_raw_nvlt_dnst_spl_cnd.png" + doc: | + Split by grouping condition the novelty score per cell density for RNA assay (not filtered). + PNG format + + raw_nvlt_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_nvlt_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition the novelty score per cell density for RNA assay (not filtered). + PDF format + + raw_atac_umi_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_raw_atac_umi_dnst_spl_cnd.png" + doc: | + Split by grouping condition UMI per cell density for ATAC assay (not filtered). + PNG format + + raw_atac_umi_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_atac_umi_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition UMI per cell density for ATAC assay (not filtered). + PDF format + + raw_peak_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_raw_peak_dnst_spl_cnd.png" + doc: | + Split by grouping condition peaks per cell density (not filtered). + PNG format + + raw_peak_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_peak_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition peaks per cell density (not filtered). + PDF format + + raw_blck_dnst_spl_cnd_plot_png: + type: File? 
+ outputBinding: + glob: "*_raw_blck_dnst_spl_cnd.png" + doc: | + Split by grouping condition the fraction of ATAC fragments within genomic + blacklist regions per cell density (not filtered). + PNG format + + raw_blck_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_blck_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition the fraction of ATAC fragments within genomic + blacklist regions per cell density (not filtered). + PDF format + + mid_fltr_1_2_qc_mtrcs_pca_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_1_2_qc_mtrcs_pca.png" + doc: | + PC1 and PC2 from the QC metrics PCA (intermediate filtered). + PNG format + + mid_fltr_1_2_qc_mtrcs_pca_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_1_2_qc_mtrcs_pca.pdf" + doc: | + PC1 and PC2 from the QC metrics PCA (intermediate filtered). + PDF format + + mid_fltr_2_3_qc_mtrcs_pca_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_2_3_qc_mtrcs_pca.png" + doc: | + PC2 and PC3 from the QC metrics PCA (intermediate filtered). + PNG format + + mid_fltr_2_3_qc_mtrcs_pca_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_2_3_qc_mtrcs_pca.pdf" + doc: | + PC2 and PC3 from the QC metrics PCA (intermediate filtered). + PDF format + + mid_fltr_cells_count_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_cells_count.png" + doc: | + Number of cells per dataset (intermediate filtered). + PNG format + + mid_fltr_cells_count_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_cells_count.pdf" + doc: | + Number of cells per dataset (intermediate filtered). + PDF format + + mid_fltr_rna_umi_dnst_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_rna_umi_dnst.png" + doc: | + UMI per cell density for RNA assay (intermediate filtered). + PNG format + + mid_fltr_rna_umi_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_rna_umi_dnst.pdf" + doc: | + UMI per cell density for RNA assay (intermediate filtered). 
+ PDF format + + mid_fltr_gene_dnst_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_gene_dnst.png" + doc: | + Genes per cell density (intermediate filtered). + PNG format + + mid_fltr_gene_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_gene_dnst.pdf" + doc: | + Genes per cell density (intermediate filtered). + PDF format + + mid_fltr_gene_umi_corr_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_gene_umi_corr.png" + doc: | + Genes vs UMI per cell correlation for RNA assay (intermediate filtered). + PNG format + + mid_fltr_gene_umi_corr_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_gene_umi_corr.pdf" + doc: | + Genes vs UMI per cell correlation for RNA assay (intermediate filtered). + PDF format + + mid_fltr_mito_dnst_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_mito_dnst.png" + doc: | + Percentage of transcripts mapped to mitochondrial genes per cell density (intermediate filtered). + PNG format + + mid_fltr_mito_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_mito_dnst.pdf" + doc: | + Percentage of transcripts mapped to mitochondrial genes per cell density (intermediate filtered). + PDF format + + mid_fltr_nvlt_dnst_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_nvlt_dnst.png" + doc: | + Novelty score per cell density for RNA assay (intermediate filtered). + PNG format + + mid_fltr_nvlt_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_nvlt_dnst.pdf" + doc: | + Novelty score per cell density for RNA assay (intermediate filtered). + PDF format + + mid_fltr_atac_umi_dnst_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_atac_umi_dnst.png" + doc: | + UMI per cell density for ATAC assay (intermediate filtered). + PNG format + + mid_fltr_atac_umi_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_atac_umi_dnst.pdf" + doc: | + UMI per cell density for ATAC assay (intermediate filtered). 
+ PDF format + + mid_fltr_peak_dnst_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_peak_dnst.png" + doc: | + Peaks per cell density (intermediate filtered). + PNG format + + mid_fltr_peak_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_peak_dnst.pdf" + doc: | + Peaks per cell density (intermediate filtered). + PDF format + + mid_fltr_blck_dnst_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_blck_dnst.png" + doc: | + Fraction of ATAC fragments within genomic blacklist regions per cell density (intermediate filtered). + PNG format + + mid_fltr_blck_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_blck_dnst.pdf" + doc: | + Fraction of ATAC fragments within genomic blacklist regions per cell density (intermediate filtered). + PDF format + + mid_fltr_rna_atac_umi_corr_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_rna_atac_umi_corr.png" + doc: | + UMI per cell correlation for RNA vs ATAC assays (intermediate filtered). + PNG format + + mid_fltr_rna_atac_umi_corr_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_rna_atac_umi_corr.pdf" + doc: | + UMI per cell correlation for RNA vs ATAC assays (intermediate filtered). + PDF format + + mid_fltr_tss_atac_umi_corr_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_tss_atac_umi_corr.png" + doc: | + TSS enrichment score vs UMI per cell correlation for ATAC assay (intermediate filtered). + PNG format + + mid_fltr_tss_atac_umi_corr_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_tss_atac_umi_corr.pdf" + doc: | + TSS enrichment score vs UMI per cell correlation for ATAC assay (intermediate filtered). + PDF format + + mid_fltr_qc_mtrcs_dnst_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_qc_mtrcs_dnst.png" + doc: | + QC metrics per cell density (intermediate filtered). + PNG format + + mid_fltr_qc_mtrcs_dnst_plot_pdf: + type: File? 
+ outputBinding: + glob: "*_mid_fltr_qc_mtrcs_dnst.pdf" + doc: | + QC metrics per cell density (intermediate filtered). + PDF format + + mid_fltr_tss_nrch_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_tss_nrch.png" + doc: | + TSS enrichment score (intermediate filtered). + PNG format + + mid_fltr_tss_nrch_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_tss_nrch.pdf" + doc: | + TSS enrichment score (intermediate filtered). + PDF format + + mid_fltr_frgm_hist_png: + type: File? + outputBinding: + glob: "*_mid_fltr_frgm_hist.png" + doc: | + Fragments length histogram (intermediate filtered). + PNG format + + mid_fltr_frgm_hist_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_frgm_hist.pdf" + doc: | + Fragments length histogram (intermediate filtered). + PDF format + + mid_fltr_rna_umi_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_rna_umi_dnst_spl_cnd.png" + doc: | + Split by grouping condition UMI per cell density for RNA assay (intermediate filtered). + PNG format + + mid_fltr_rna_umi_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_rna_umi_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition UMI per cell density for RNA assay (intermediate filtered). + PDF format + + mid_fltr_gene_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_gene_dnst_spl_cnd.png" + doc: | + Split by grouping condition genes per cell density (intermediate filtered). + PNG format + + mid_fltr_gene_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_gene_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition genes per cell density (intermediate filtered). + PDF format + + mid_fltr_mito_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_mito_dnst_spl_cnd.png" + doc: | + Split by grouping condition the percentage of transcripts mapped + to mitochondrial genes per cell density (intermediate filtered). 
+ PNG format + + mid_fltr_mito_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_mito_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition the percentage of transcripts mapped + to mitochondrial genes per cell density (intermediate filtered). + PDF format + + mid_fltr_nvlt_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_nvlt_dnst_spl_cnd.png" + doc: | + Split by grouping condition the novelty score per cell density for RNA assay (intermediate filtered). + PNG format + + mid_fltr_nvlt_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_nvlt_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition the novelty score per cell density for RNA assay (intermediate filtered). + PDF format + + mid_fltr_atac_umi_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_atac_umi_dnst_spl_cnd.png" + doc: | + Split by grouping condition UMI per cell density for ATAC assay (intermediate filtered). + PNG format + + mid_fltr_atac_umi_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_atac_umi_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition UMI per cell density for ATAC assay (intermediate filtered). + PDF format + + mid_fltr_peak_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_peak_dnst_spl_cnd.png" + doc: | + Split by grouping condition peaks per cell density (intermediate filtered). + PNG format + + mid_fltr_peak_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_peak_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition peaks per cell density (intermediate filtered). + PDF format + + mid_fltr_blck_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_blck_dnst_spl_cnd.png" + doc: | + Split by grouping condition the fraction of ATAC fragments within genomic + blacklist regions per cell density (intermediate filtered). + PNG format + + mid_fltr_blck_dnst_spl_cnd_plot_pdf: + type: File? 
+ outputBinding: + glob: "*_mid_fltr_blck_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition the fraction of ATAC fragments within genomic + blacklist regions per cell density (intermediate filtered). + PDF format + + fltr_1_2_qc_mtrcs_pca_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_1_2_qc_mtrcs_pca.png" + doc: | + PC1 and PC2 from the QC metrics PCA (filtered). + PNG format + + fltr_1_2_qc_mtrcs_pca_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_1_2_qc_mtrcs_pca.pdf" + doc: | + PC1 and PC2 from the QC metrics PCA (filtered). + PDF format + + fltr_2_3_qc_mtrcs_pca_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_2_3_qc_mtrcs_pca.png" + doc: | + PC2 and PC3 from the QC metrics PCA (filtered). + PNG format + + fltr_2_3_qc_mtrcs_pca_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_2_3_qc_mtrcs_pca.pdf" + doc: | + PC2 and PC3 from the QC metrics PCA (filtered). + PDF format + + fltr_cells_count_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_cells_count.png" + doc: | + Number of cells per dataset (filtered). + PNG format + + fltr_cells_count_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_cells_count.pdf" + doc: | + Number of cells per dataset (filtered). + PDF format + + fltr_rna_umi_dnst_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_rna_umi_dnst.png" + doc: | + UMI per cell density for RNA assay (filtered). + PNG format + + fltr_rna_umi_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_rna_umi_dnst.pdf" + doc: | + UMI per cell density for RNA assay (filtered). + PDF format + + fltr_gene_dnst_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_gene_dnst.png" + doc: | + Genes per cell density (filtered). + PNG format + + fltr_gene_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_gene_dnst.pdf" + doc: | + Genes per cell density (filtered). + PDF format + + fltr_gene_umi_corr_plot_png: + type: File? 
+ outputBinding: + glob: "*[!_mid]_fltr_gene_umi_corr.png" + doc: | + Genes vs UMI per cell correlation for RNA assay (filtered). + PNG format + + fltr_gene_umi_corr_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_gene_umi_corr.pdf" + doc: | + Genes vs UMI per cell correlation for RNA assay (filtered). + PDF format + + fltr_mito_dnst_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_mito_dnst.png" + doc: | + Percentage of transcripts mapped to mitochondrial genes per cell density (filtered). + PNG format + + fltr_mito_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_mito_dnst.pdf" + doc: | + Percentage of transcripts mapped to mitochondrial genes per cell density (filtered). + PDF format + + fltr_nvlt_dnst_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_nvlt_dnst.png" + doc: | + Novelty score per cell density for RNA assay (filtered). + PNG format + + fltr_nvlt_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_nvlt_dnst.pdf" + doc: | + Novelty score per cell density for RNA assay (filtered). + PDF format + + fltr_atac_umi_dnst_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_atac_umi_dnst.png" + doc: | + UMI per cell density for ATAC assay (filtered). + PNG format + + fltr_atac_umi_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_atac_umi_dnst.pdf" + doc: | + UMI per cell density for ATAC assay (filtered). + PDF format + + fltr_peak_dnst_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_peak_dnst.png" + doc: | + Peaks per cell density (filtered). + PNG format + + fltr_peak_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_peak_dnst.pdf" + doc: | + Peaks per cell density (filtered). + PDF format + + fltr_blck_dnst_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_blck_dnst.png" + doc: | + Fraction of ATAC fragments within genomic blacklist regions per cell density (filtered). 
+ PNG format + + fltr_blck_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_blck_dnst.pdf" + doc: | + Fraction of ATAC fragments within genomic blacklist regions per cell density (filtered). + PDF format + + fltr_rna_atac_umi_corr_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_rna_atac_umi_corr.png" + doc: | + UMI per cell correlation for RNA vs ATAC assays (filtered). + PNG format + + fltr_rna_atac_umi_corr_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_rna_atac_umi_corr.pdf" + doc: | + UMI per cell correlation for RNA vs ATAC assays (filtered). + PDF format + + fltr_tss_atac_umi_corr_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_tss_atac_umi_corr.png" + doc: | + TSS enrichment score vs UMI per cell correlation for ATAC assay (filtered). + PNG format + + fltr_tss_atac_umi_corr_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_tss_atac_umi_corr.pdf" + doc: | + TSS enrichment score vs UMI per cell correlation for ATAC assay (filtered). + PDF format + + fltr_qc_mtrcs_dnst_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_qc_mtrcs_dnst.png" + doc: | + QC metrics per cell density (filtered). + PNG format + + fltr_qc_mtrcs_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_qc_mtrcs_dnst.pdf" + doc: | + QC metrics per cell density (filtered). + PDF format + + fltr_tss_nrch_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_tss_nrch.png" + doc: | + TSS enrichment score (filtered). + PNG format + + fltr_tss_nrch_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_tss_nrch.pdf" + doc: | + TSS enrichment score (filtered). + PDF format + + fltr_frgm_hist_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_frgm_hist.png" + doc: | + Fragments length histogram (filtered). + PNG format + + fltr_frgm_hist_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_frgm_hist.pdf" + doc: | + Fragments length histogram (filtered). 
+ PDF format + + fltr_rna_umi_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_rna_umi_dnst_spl_cnd.png" + doc: | + Split by grouping condition UMI per cell density for RNA assay (filtered). + PNG format + + fltr_rna_umi_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_rna_umi_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition UMI per cell density for RNA assay (filtered). + PDF format + + fltr_gene_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_gene_dnst_spl_cnd.png" + doc: | + Split by grouping condition genes per cell density (filtered). + PNG format + + fltr_gene_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_gene_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition genes per cell density (filtered). + PDF format + + fltr_mito_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_mito_dnst_spl_cnd.png" + doc: | + Split by grouping condition the percentage of transcripts mapped + to mitochondrial genes per cell density (filtered). + PNG format + + fltr_mito_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_mito_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition the percentage of transcripts mapped + to mitochondrial genes per cell density (filtered). + PDF format + + fltr_nvlt_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_nvlt_dnst_spl_cnd.png" + doc: | + Split by grouping condition the novelty score per cell density for RNA assay (filtered). + PNG format + + fltr_nvlt_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_nvlt_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition the novelty score per cell density for RNA assay (filtered). + PDF format + + fltr_atac_umi_dnst_spl_cnd_plot_png: + type: File? 
+ outputBinding: + glob: "*[!_mid]_fltr_atac_umi_dnst_spl_cnd.png" + doc: | + Split by grouping condition UMI per cell density for ATAC assay (filtered). + PNG format + + fltr_atac_umi_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_atac_umi_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition UMI per cell density for ATAC assay (filtered). + PDF format + + fltr_peak_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_peak_dnst_spl_cnd.png" + doc: | + Split by grouping condition peaks per cell density (filtered). + PNG format + + fltr_peak_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_peak_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition peaks per cell density (filtered). + PDF format + + fltr_blck_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_blck_dnst_spl_cnd.png" + doc: | + Split by grouping condition the fraction of ATAC fragments within genomic + blacklist regions per cell density (filtered). + PNG format + + fltr_blck_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_blck_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition the fraction of ATAC fragments within genomic + blacklist regions per cell density (filtered). + PDF format + + ucsc_cb_config_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser" + doc: | + Directory with UCSC Cellbrowser configuration data. + + ucsc_cb_html_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser/html_data" + doc: | + Directory with UCSC Cellbrowser html data. + + ucsc_cb_html_file: + type: File? + outputBinding: + glob: "*_cellbrowser/html_data/index.html" + doc: | + HTML index file from the directory with UCSC Cellbrowser html data. + + seurat_data_rds: + type: File + outputBinding: + glob: "*_data.rds" + doc: | + Filtered Seurat data in RDS format + + seurat_data_h5seurat: + type: File? 
+ outputBinding: + glob: "*_data.h5seurat" + doc: | + Filtered Seurat data in h5seurat format + + seurat_data_h5ad: + type: File? + outputBinding: + glob: "*_data.h5ad" + doc: | + Reduced Seurat data in h5ad format + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["sc_multiome_filter.R"] +arguments: +- valueFrom: | + ${ + if (inputs.aggregation_metadata) { + return inputs.aggregation_metadata; + } else { + return runtime.outdir + "/dummy_metadata.csv" + } + } + prefix: "--identity" + + +stdout: sc_multiome_filter_stdout.log +stderr: sc_multiome_filter_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-cell Multiome ATAC and RNA-Seq Filtering Analysis" +s:name: "Single-cell Multiome ATAC and RNA-Seq Filtering Analysis" +s:alternateName: "Filters single-cell multiome ATAC and RNA-Seq datasets based on the common QC metrics" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-multiome-filter.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: 
mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell Multiome ATAC and RNA-Seq Filtering Analysis + + Filters single-cell multiome ATAC and RNA-Seq datasets based on the common QC metrics. + + +s:about: | + usage: sc_multiome_filter.R + [-h] --mex MEX --identity IDENTITY --fragments FRAGMENTS --annotations + ANNOTATIONS [--grouping GROUPING] [--blacklist BLACKLIST] + [--barcodes BARCODES] [--rnamincells RNAMINCELLS] + [--mingenes [MINGENES ...]] [--maxgenes [MAXGENES ...]] + [--rnaminumi [RNAMINUMI ...]] [--mitopattern MITOPATTERN] + [--maxmt MAXMT] [--minnovelty [MINNOVELTY ...]] + [--atacmincells ATACMINCELLS] [--atacminumi [ATACMINUMI ...]] + [--maxnuclsignal [MAXNUCLSIGNAL ...]] + [--mintssenrich [MINTSSENRICH ...]] [--minfrip [MINFRIP ...]] + [--maxblacklist [MAXBLACKLIST ...]] [--callby CALLBY] [--pdf] + [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + + Single-cell Multiome ATAC and RNA-Seq Filtering Analysis + + options: + -h, --help show this help message and exit + --mex MEX Path to the folder with feature-barcode matrix from + Cell Ranger ARC Count/Aggregate experiment in MEX + format. The rows consist of all the genes and peaks + concatenated together and the columns are restricted + to those barcodes that are identified as cells. + --identity IDENTITY Path to the metadata TSV/CSV file to set the datasets + identities. If '--mex' points to the Cell Ranger ARC + Aggregate outputs, the aggr.csv file can be used. If + Cell Ranger ARC Count outputs have been used in the ' + --mex' input, the file should include at least one + column - 'library_id' and one row with the alias for + Cell Ranger ARC Count experiment. + --fragments FRAGMENTS + Count and barcode information for every ATAC fragment + observed in the experiment in TSV format. Tbi-index + file is required. 
+ --annotations ANNOTATIONS + Path to the genome annotation file in GTF format + --grouping GROUPING Path to the TSV/CSV file to define datasets grouping. + First column - 'library_id' with the values and order + that correspond to the 'library_id' column from the ' + --identity' file, second column 'condition'. Default: + each dataset is assigned to its own group. + --blacklist BLACKLIST + Path to the optional BED file with the genomic + blacklist regions. + --barcodes BARCODES Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata overwriting the existing ones if + those are present. Default: all cells used, no extra + metadata is added + --rnamincells RNAMINCELLS + Include only genes detected in at least this many + cells. Default: 5 (applied to all datasets) + --mingenes [MINGENES ...] + Include cells where at least this many genes are + detected. If multiple values provided, each of them + will be applied to the correspondent dataset from the + '--mex' input based on the '--identity' file. Default: + 250 (applied to all datasets) + --maxgenes [MAXGENES ...] + Include cells with the number of genes not bigger than + this value. If multiple values provided, each of them + will be applied to the correspondent dataset from the + '--mex' input based on the '--identity' file. Default: + 5000 (applied to all datasets) + --rnaminumi [RNAMINUMI ...] + Include cells where at least this many UMI (RNA + transcripts) are detected. If multiple values + provided, each of them will be applied to the + correspondent dataset from the '--mex' input based on + the '--identity' file. Default: 500 (applied to all + datasets) + --mitopattern MITOPATTERN + Regex pattern to identify mitochondrial genes. 
+ Default: '^mt-|^MT-' + --maxmt MAXMT Include cells with the percentage of transcripts + mapped to mitochondrial genes not bigger than this + value. Default: 5 (applied to all datasets) + --minnovelty [MINNOVELTY ...] + Include cells with the novelty score not lower than + this value, calculated as log10(genes)/log10(UMI) + for RNA assay. If multiple values provided, each of + them will be applied to the correspondent dataset from + the '--mex' input based on the '--identity' file. + Default: 0.8 (applied to all datasets) + --atacmincells ATACMINCELLS + Include only peaks detected in at least this many + cells. Default: 5 (applied to all datasets) + --atacminumi [ATACMINUMI ...] + Include cells where at least this many UMI (ATAC + transcripts) are detected. If multiple values + provided, each of them will be applied to the + correspondent dataset from the '--mex' input based on + the '--identity' file. Default: 1000 (applied to all + datasets) + --maxnuclsignal [MAXNUCLSIGNAL ...] + Include cells with the nucleosome signal not bigger + than this value. Nucleosome signal quantifies the + approximate ratio of mononucleosomal to nucleosome- + free fragments. If multiple values provided, each of + them will be applied to the correspondent dataset from + the '--mex' input based on the '--identity' file. + Default: 4 (applied to all datasets) + --mintssenrich [MINTSSENRICH ...] + Include cells with the TSS enrichment score not lower + than this value. Score is calculated based on the + ratio of fragments centered at the TSS to fragments in + TSS-flanking regions. If multiple values provided, + each of them will be applied to the correspondent + dataset from the '--mex' input based on the '-- + identity' file. Default: 2 (applied to all datasets) + --minfrip [MINFRIP ...] + Include cells with the FRiP not lower than this value. 
+ If multiple values provided, each of them will be + applied to the correspondent dataset from the '--mex' + input based on the '--identity' file. FRiP is + calculated for fragments. Default: 0.15 (applied to + all datasets) + --maxblacklist [MAXBLACKLIST ...] + Include cells with the fraction of fragments in + genomic blacklist regions not bigger than this value. + If multiple values provided, each of them will be + applied to the correspondent dataset from the '--mex' + input based on the '--identity' file. Default: 0.05 + (applied to all datasets) + --callby CALLBY Replace Cell Ranger ARC peaks with MACS2 peaks called + for cells grouped by the column from the optionally + provided --barcodes file. If --barcodes file was not + provided MACS2 peaks can be still called per dataset + by setting --callby to new.ident. Peaks are called + only after applying all RNA related thresholds, + maximum nucleosome signal, and minimum TSS enrichment + scores filters. Default: do not call peaks + --pdf Export plots in PDF. Default: false + --verbose Print debug information. Default: false + --h5seurat Save Seurat data to h5seurat file. Default: false + --h5ad Save Seurat data to h5ad file. Default: false + --cbbuild Export results to UCSC Cell Browser. Default: false + --output OUTPUT Output prefix. Default: ./sc + --theme {gray,bw,linedraw,light,dark,minimal,classic,void} + Color theme for all generated plots. Default: classic + --cpus CPUS Number of cores/cpus to use. Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple '--cpus'. 
Default: 32 \ No newline at end of file diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl new file mode 100644 index 00000000..2ca33041 --- /dev/null +++ b/tools/sc-rna-cluster.cwl @@ -0,0 +1,785 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: EnvVarRequirement + envDef: + R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-tools:v0.0.15 + + +inputs: + + query_data_rds: + type: File + inputBinding: + prefix: "--query" + doc: | + Path to the RDS file to load Seurat object from. This file should include genes + expression information stored in the RNA assay, as well as 'pca' and 'rnaumap' + dimensionality reductions applied to that assay. + + dimensions: + type: + - "null" + - int + - int[] + inputBinding: + prefix: "--dimensions" + doc: | + Dimensionality to use when constructing nearest- + neighbor graph before clustering (from 1 to 50). If + single value N is provided, use from 1 to N + dimensions. If multiple values are provided, subset to + only selected dimensions. + Default: from 1 to 10 + + cluster_metric: + type: + - "null" + - type: enum + symbols: + - "euclidean" + - "cosine" + - "manhattan" + - "hamming" + inputBinding: + prefix: "--ametric" + doc: | + Distance metric used when constructing nearest-neighbor graph before clustering. + Default: euclidean + + cluster_algorithm: + type: + - "null" + - type: enum + symbols: + - "louvain" + - "mult-louvain" + - "slm" + - "leiden" + inputBinding: + prefix: "--algorithm" + doc: | + Algorithm for modularity optimization when running clustering. + Default: louvain + + resolution: + type: + - "null" + - float + - float[] + inputBinding: + prefix: "--resolution" + doc: | + Clustering resolution applied to the constructed nearest-neighbor graph. 
+ Can be set as an array but only the first item from the list will be used + for cluster labels and gene markers in the UCSC Cell Browser when running + with --cbbuild and --diffgenes parameters. + Default: 0.3, 0.5, 1.0 + + genes_of_interest: + type: + - "null" + - string + - string[] + inputBinding: + prefix: "--genes" + doc: | + Genes of interest to build genes expression plots. + Default: None + + identify_diff_genes: + type: boolean? + inputBinding: + prefix: "--diffgenes" + doc: | + Identify differentially expressed genes (putative gene markers) between each + pair of clusters for all resolutions. + Default: false + + minimum_logfc: + type: float? + inputBinding: + prefix: "--logfc" + doc: | + For putative gene markers identification include only those genes that + on average have log fold change difference in expression between every + tested pair of clusters not lower than this value. Ignored if '--diffgenes' + is not set. + Default: 0.25 + + minimum_pct: + type: float? + inputBinding: + prefix: "--minpct" + doc: | + For putative gene markers identification include only those genes that + are detected in not lower than this fraction of cells in either of the + two tested clusters. Ignored if '--diffgenes' is not set. + Default: 0.1 + + only_positive_diff_genes: + type: boolean? + inputBinding: + prefix: "--onlypos" + doc: | + For putative gene markers identification return only positive markers. + Ignored if '--diffgenes' is not set. + Default: false + + test_to_use: + type: + - "null" + - type: enum + symbols: + - "wilcox" + - "bimod" + - "roc" + - "t" + - "negbinom" + - "poisson" + - "LR" + - "MAST" + - "DESeq2" + inputBinding: + prefix: "--testuse" + doc: | + Statistical test to use for putative gene markers identification. + Ignored if '--diffgenes' is not set. + Default: wilcox + + export_pdf_plots: + type: boolean? + inputBinding: + prefix: "--pdf" + doc: | + Export plots in PDF. 
+ Default: false + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + inputBinding: + prefix: "--theme" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + + verbose: + type: boolean? + inputBinding: + prefix: "--verbose" + doc: | + Print debug information. + Default: false + + export_h5seurat_data: + type: boolean? + inputBinding: + prefix: "--h5seurat" + doc: | + Save Seurat data to h5seurat file. + Default: false + + export_h5ad_data: + type: boolean? + inputBinding: + prefix: "--h5ad" + doc: | + Save Seurat data to h5ad file. + Default: false + + export_ucsc_cb: + type: boolean? + inputBinding: + prefix: "--cbbuild" + doc: | + Export results to UCSC Cell Browser. Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix. + Default: ./sc + + parallel_memory_limit: + type: int? + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Default: 32 + + vector_memory_limit: + type: int? + default: 128 + doc: | + Maximum vector memory in GB allowed to be used by R. + Default: 128 + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. + Default: 1 + + +outputs: + + umap_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_res_*.png" + doc: | + Clustered cells UMAP. + PNG format + + umap_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_res_*.pdf" + doc: | + Clustered cells UMAP. + PDF format + + slh_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_slh_res_*.png" + doc: | + Silhouette scores. Downsampled to max 500 cells per cluster. 
+ PNG format + + slh_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_slh_res_*.pdf" + doc: | + Silhouette scores. Downsampled to max 500 cells per cluster. + PDF format + + umap_spl_idnt_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_idnt_res_*.png" + doc: | + Split by dataset clustered cells UMAP. + PNG format + + umap_spl_idnt_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_idnt_res_*.pdf" + doc: | + Split by dataset clustered cells UMAP. + PDF format + + cmp_gr_clst_spl_idnt_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_clst_spl_idnt_res_*.png" + doc: | + Grouped by cluster split by dataset cells composition plot. Downsampled. + PNG format + + cmp_gr_clst_spl_idnt_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_clst_spl_idnt_res_*.pdf" + doc: | + Grouped by cluster split by dataset cells composition plot. Downsampled. + PDF format + + cmp_gr_idnt_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_idnt_spl_clst_res_*.png" + doc: | + Grouped by dataset split by cluster cells composition plot. Downsampled. + PNG format + + cmp_gr_idnt_spl_clst_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_idnt_spl_clst_res_*.pdf" + doc: | + Grouped by dataset split by cluster cells composition plot. Downsampled. + PDF format + + umap_spl_cnd_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_cnd_res_*.png" + doc: | + Split by grouping condition clustered cells UMAP. + PNG format + + umap_spl_cnd_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_cnd_res_*.pdf" + doc: | + Split by grouping condition clustered cells UMAP. 
+ PDF format + + cmp_gr_clst_spl_cnd_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_clst_spl_cnd_res_*.png" + doc: | + Grouped by cluster split by condition cells composition plot. Downsampled. + PNG format + + cmp_gr_clst_spl_cnd_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_clst_spl_cnd_res_*.pdf" + doc: | + Grouped by cluster split by condition cells composition plot. Downsampled. + PDF format + + cmp_gr_cnd_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_cnd_spl_clst_res_*.png" + doc: | + Grouped by condition split by cluster cells composition plot. Downsampled. + PNG format + + cmp_gr_cnd_spl_clst_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_cnd_spl_clst_res_*.pdf" + doc: | + Grouped by condition split by cluster cells composition plot. Downsampled. + PDF format + + umap_spl_ph_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_ph_res_*.png" + doc: | + Split by cell cycle phase clustered cells UMAP. + PNG format + + umap_spl_ph_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_ph_res_*.pdf" + doc: | + Split by cell cycle phase clustered cells UMAP. + PDF format + + cmp_gr_ph_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_idnt.png" + doc: | + Grouped by cell cycle phase split by dataset cells composition plot. Downsampled. + PNG format + + cmp_gr_ph_spl_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_idnt.pdf" + doc: | + Grouped by cell cycle phase split by dataset cells composition plot. Downsampled. 
+ PDF format + + cmp_gr_ph_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_ph_spl_clst_res_*.png" + doc: | + Grouped by cell cycle phase split by cluster cells composition plot. Downsampled. + PNG format + + cmp_gr_ph_spl_clst_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_ph_spl_clst_res_*.pdf" + doc: | + Grouped by cell cycle phase split by cluster cells composition plot. Downsampled. + PDF format + + xpr_avg_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_avg_res_*.png" + doc: | + Log normalized scaled average gene expression per cluster. + PNG format + + xpr_avg_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_avg_res_*.pdf" + doc: | + Log normalized scaled average gene expression per cluster. + PDF format + + xpr_per_cell_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_[!sgnl_]*.png" + doc: | + Log normalized gene expression on cells UMAP. + PNG format + + xpr_per_cell_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_[!sgnl_]*.pdf" + doc: | + Log normalized gene expression on cells UMAP. + PDF format + + xpr_per_cell_sgnl_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_sgnl_*.png" + doc: | + Log normalized gene expression density on cells UMAP. + PNG format + + xpr_per_cell_sgnl_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_sgnl_*.pdf" + doc: | + Log normalized gene expression density on cells UMAP. + PDF format + + xpr_dnst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_dnst_res_*.png" + doc: | + Log normalized gene expression density per cluster. 
+ PNG format + + xpr_dnst_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_dnst_res_*.pdf" + doc: | + Log normalized gene expression density per cluster. + PDF format + + xpr_htmp_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_htmp_res_*.png" + doc: | + Normalized gene expression heatmap grouped by cluster. + PNG format + + xpr_htmp_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_htmp_res_*.pdf" + doc: | + Normalized gene expression heatmap grouped by cluster. + PDF format + + gene_markers_tsv: + type: File? + outputBinding: + glob: "*_gene_markers.tsv" + doc: | + Differentially expressed genes between each pair of clusters for all resolutions. + TSV format + + ucsc_cb_config_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser" + doc: | + Directory with UCSC Cellbrowser configuration data. + + ucsc_cb_html_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser/html_data" + doc: | + Directory with UCSC Cellbrowser html data. + + ucsc_cb_html_file: + type: File? + outputBinding: + glob: "*_cellbrowser/html_data/index.html" + doc: | + HTML index file from the directory with UCSC Cellbrowser html data. + + seurat_data_rds: + type: File + outputBinding: + glob: "*_data.rds" + doc: | + Reduced Seurat data in RDS format + + seurat_data_h5seurat: + type: File? + outputBinding: + glob: "*_data.h5seurat" + doc: | + Reduced Seurat data in h5seurat format + + seurat_data_h5ad: + type: File? 
+ outputBinding: + glob: "*_data.h5ad" + doc: | + Reduced Seurat data in h5ad format + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["sc_rna_cluster.R"] + +stdout: sc_rna_cluster_stdout.log +stderr: sc_rna_cluster_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-cell RNA-Seq Cluster Analysis" +s:name: "Single-cell RNA-Seq Cluster Analysis" +s:alternateName: "Clusters single-cell RNA-Seq datasets, identifies gene markers" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-rna-cluster.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell RNA-Seq Cluster Analysis + + Clusters single-cell RNA-Seq datasets, identifies gene markers. 
+ + +s:about: | + usage: sc_rna_cluster.R + [-h] --query QUERY [--dimensions [DIMENSIONS ...]] + [--ametric {euclidean,cosine,manhattan,hamming}] + [--algorithm {louvain,mult-louvain,slm,leiden}] + [--resolution [RESOLUTION ...]] [--genes [GENES ...]] [--diffgenes] + [--logfc LOGFC] [--minpct MINPCT] [--onlypos] + [--testuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] + [--pdf] [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + + Single-cell RNA-Seq Cluster Analysis + + options: + -h, --help show this help message and exit + --query QUERY Path to the RDS file to load Seurat object from. This + file should include genes expression information + stored in the RNA assay, as well as 'pca' and + 'rnaumap' dimensionality reductions applied to that + assay. + --dimensions [DIMENSIONS ...] + Dimensionality to use when constructing nearest- + neighbor graph before clustering (from 1 to 50). If + single value N is provided, use from 1 to N + dimensions. If multiple values are provided, subset to + only selected dimensions. Default: from 1 to 10 + --ametric {euclidean,cosine,manhattan,hamming} + Distance metric used when constructing nearest- + neighbor graph before clustering. Default: euclidean + --algorithm {louvain,mult-louvain,slm,leiden} + Algorithm for modularity optimization when running + clustering. Default: louvain + --resolution [RESOLUTION ...] + Clustering resolution applied to the constructed + nearest-neighbor graph. Can be set as an array but + only the first item from the list will be used for + cluster labels and gene markers in the UCSC Cell + Browser when running with --cbbuild and --diffgenes + parameters. Default: 0.3, 0.5, 1.0 + --genes [GENES ...] Genes of interest to build genes expression plots. 
+ Default: None + --diffgenes Identify differentially expressed genes (putative gene + markers) between each pair of clusters for all + resolutions. Default: false + --logfc LOGFC For putative gene markers identification include only + those genes that on average have log fold change + difference in expression between every tested pair of + clusters not lower than this value. Ignored if '-- + diffgenes' is not set. Default: 0.25 + --minpct MINPCT For putative gene markers identification include only + those genes that are detected in not lower than this + fraction of cells in either of the two tested + clusters. Ignored if '--diffgenes' is not set. + Default: 0.1 + --onlypos For putative gene markers identification return only + positive markers. Ignored if '--diffgenes' is not set. + Default: false + --testuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2} + Statistical test to use for putative gene markers + identification. Ignored if '--diffgenes' is not set. + Default: wilcox + --pdf Export plots in PDF. Default: false + --verbose Print debug information. Default: false + --h5seurat Save Seurat data to h5seurat file. Default: false + --h5ad Save Seurat data to h5ad file. Default: false + --cbbuild Export results to UCSC Cell Browser. Default: false + --output OUTPUT Output prefix. Default: ./sc + --theme {gray,bw,linedraw,light,dark,minimal,classic,void} + Color theme for all generated plots. Default: classic + --cpus CPUS Number of cores/cpus to use. Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple --cpus. 
Default: 32 \ No newline at end of file diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl new file mode 100644 index 00000000..0eedf0a9 --- /dev/null +++ b/tools/sc-rna-da-cells.cwl @@ -0,0 +1,602 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: EnvVarRequirement + envDef: + R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-tools:v0.0.15 + + +inputs: + + query_data_rds: + type: File + inputBinding: + prefix: "--query" + doc: | + Path to the RDS file to load Seurat object from. This file should include genes + expression information stored in the RNA assay and selected with the --reduction + parameter dimensionality reduction. Additionally, 'rnaumap', and/or 'atacumap', + and/or 'wnnumap' dimensionality reductions should be present. + + reduction: + type: string? + inputBinding: + prefix: "--reduction" + doc: | + Dimensionality reduction to be used for DA analysis. + Default: pca + + dimensions: + type: + - "null" + - int + - int[] + inputBinding: + prefix: "--dimensions" + doc: | + Dimensionality to use when running DA analysis (from 1 to 50). + If single value N is provided, use from 1 to N PCs. If multiple + values are provided, subset to only selected PCs. + Default: from 1 to 10 + + score_vector_knn: + type: + - "null" + - int + - int[] + inputBinding: + prefix: "--knn" + doc: | + Array of k values for kNN graph construction when calculating the + score vector for each cell to represent the DA behavior in the + neighborhood. + Default: calculated based on the cells number + + datasets_metadata: + type: File? + inputBinding: + prefix: "--metadata" + doc: | + Path to the TSV/CSV file to optionally extend Seurat object metadata with + categorical values using samples identities. 
First column - 'library_id' + should correspond to all unique values from the 'new.ident' column of the + loaded Seurat object. If any of the columns provided in this file are already + present in the Seurat object metadata, they will be overwritten. + Default: no extra metadata is added + + splitby: + type: string + inputBinding: + prefix: "--splitby" + doc: | + Column from the Seurat object metadata to split cells into two groups + to run --second vs --first DA analysis. May include columns from the + extra metadata added with --metadata parameter. + + first_cond: + type: string + inputBinding: + prefix: "--first" + doc: | + Value from the Seurat object metadata column set with --splitby to define + the first group of cells for DA analysis. + + second_cond: + type: string + inputBinding: + prefix: "--second" + doc: | + Value from the Seurat object metadata column set with --splitby to define + the second group of cells for DA analysis. + + resolution: + type: + - "null" + - float + - float[] + inputBinding: + prefix: "--resolution" + doc: | + Clustering resolution applied to DA cells to identify DA cells populations. + Can be set as an array. + Default: 0.01, 0.03, 0.05 + + ranges: + type: + - "null" + - float[] + inputBinding: + prefix: "--ranges" + doc: | + DA scores ranges to filter out not significant cells. + Default: calculated based on the permutation test + + export_pdf_plots: + type: boolean? + inputBinding: + prefix: "--pdf" + doc: | + Export plots in PDF. + Default: false + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + inputBinding: + prefix: "--theme" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + + verbose: + type: boolean? + inputBinding: + prefix: "--verbose" + doc: | + Print debug information.
+ Default: false + + export_h5seurat_data: + type: boolean? + inputBinding: + prefix: "--h5seurat" + doc: | + Save Seurat data to h5seurat file. + Default: false + + export_h5ad_data: + type: boolean? + inputBinding: + prefix: "--h5ad" + doc: | + Save Seurat data to h5ad file. + Default: false + + export_ucsc_cb: + type: boolean? + inputBinding: + prefix: "--cbbuild" + doc: | + Export results to UCSC Cell Browser. Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix. + Default: ./sc + + parallel_memory_limit: + type: int? + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Default: 32 + + vector_memory_limit: + type: int? + default: 128 + doc: | + Maximum vector memory in GB allowed to be used by R. + Default: 128 + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. + Default: 1 + + +outputs: + + da_perm_plot_png: + type: File? + outputBinding: + glob: "*_da_perm.png" + doc: | + DA scores random permutations plot for second + vs first biological conditions comparison. + PNG format + + da_perm_plot_pdf: + type: File? + outputBinding: + glob: "*_da_perm.pdf" + doc: | + DA scores random permutations plot for second + vs first biological conditions comparison. + PDF format + + umap_rd_rnaumap_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_rd_rnaumap_res_*.png" + doc: | + Clustered DA cells subpopulations UMAP (rnaumap dim. reduction). + PNG format + + umap_rd_rnaumap_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_rd_rnaumap_res_*.pdf" + doc: | + Clustered DA cells subpopulations UMAP (rnaumap dim. reduction). 
+ PDF format + + umap_rd_atacumap_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_rd_atacumap_res_*.png" + doc: | + Clustered DA cells subpopulations UMAP (atacumap dim. reduction). + PNG format + + umap_rd_atacumap_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_rd_atacumap_res_*.pdf" + doc: | + Clustered DA cells subpopulations UMAP (atacumap dim. reduction). + PDF format + + umap_rd_wnnumap_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_rd_wnnumap_res_*.png" + doc: | + Clustered DA cells subpopulations UMAP (wnnumap dim. reduction). + PNG format + + umap_rd_wnnumap_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_rd_wnnumap_res_*.pdf" + doc: | + Clustered DA cells subpopulations UMAP (wnnumap dim. reduction). + PDF format + + umap_spl_cnd_rd_rnaumap_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_cnd_rd_rnaumap_res_*.png" + doc: | + Split by grouping condition clustered DA cells subpopulations UMAP + (rnaumap dim. reduction). + PNG format + + umap_spl_cnd_rd_rnaumap_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_cnd_rd_rnaumap_res_*.pdf" + doc: | + Split by grouping condition clustered DA cells subpopulations UMAP + (rnaumap dim. reduction). + PDF format + + umap_spl_cnd_rd_atacumap_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_cnd_rd_atacumap_res_*.png" + doc: | + Split by grouping condition clustered DA cells subpopulations UMAP + (atacumap dim. reduction). + PNG format + + umap_spl_cnd_rd_atacumap_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_cnd_rd_atacumap_res_*.pdf" + doc: | + Split by grouping condition clustered DA cells subpopulations UMAP + (atacumap dim. 
reduction). + PDF format + + umap_spl_cnd_rd_wnnumap_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_cnd_rd_wnnumap_res_*.png" + doc: | + Split by grouping condition clustered DA cells subpopulations UMAP + (wnnumap dim. reduction). + PNG format + + umap_spl_cnd_rd_wnnumap_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_cnd_rd_wnnumap_res_*.pdf" + doc: | + Split by grouping condition clustered DA cells subpopulations UMAP + (wnnumap dim. reduction). + PDF format + + umap_spl_idnt_rd_rnaumap_da_scr_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_idnt_rd_rnaumap_da_scr.png" + doc: | + Split by dataset cells UMAP with DA scores for second vs first + biological conditions comparison (rnaumap dim. reduction). + PNG format + + umap_spl_idnt_rd_rnaumap_da_scr_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_idnt_rd_rnaumap_da_scr.pdf" + doc: | + Split by dataset cells UMAP with DA scores for second vs first + biological conditions comparison (rnaumap dim. reduction). + PDF format + + umap_spl_idnt_rd_atacumap_da_scr_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_idnt_rd_atacumap_da_scr.png" + doc: | + Split by dataset cells UMAP with DA scores for second vs first + biological conditions comparison (atacumap dim. reduction). + PNG format + + umap_spl_idnt_rd_atacumap_da_scr_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_idnt_rd_atacumap_da_scr.pdf" + doc: | + Split by dataset cells UMAP with DA scores for second vs first + biological conditions comparison (atacumap dim. reduction). + PDF format + + umap_spl_idnt_rd_wnnumap_da_scr_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_idnt_rd_wnnumap_da_scr.png" + doc: | + Split by dataset cells UMAP with DA scores for second vs first + biological conditions comparison (wnnumap dim. reduction). 
+ PNG format + + umap_spl_idnt_rd_wnnumap_da_scr_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_idnt_rd_wnnumap_da_scr.pdf" + doc: | + Split by dataset cells UMAP with DA scores for second vs first + biological conditions comparison (wnnumap dim. reduction). + PDF format + + ucsc_cb_config_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser" + doc: | + Directory with UCSC Cellbrowser configuration data. + + ucsc_cb_html_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser/html_data" + doc: | + Directory with UCSC Cellbrowser html data. + + ucsc_cb_html_file: + type: File? + outputBinding: + glob: "*_cellbrowser/html_data/index.html" + doc: | + HTML index file from the directory with UCSC Cellbrowser html data. + + seurat_data_rds: + type: File + outputBinding: + glob: "*_data.rds" + doc: | + Reduced Seurat data in RDS format + + seurat_data_h5seurat: + type: File? + outputBinding: + glob: "*_data.h5seurat" + doc: | + Reduced Seurat data in h5seurat format + + seurat_data_h5ad: + type: File? 
+ outputBinding: + glob: "*_data.h5ad" + doc: | + Reduced Seurat data in h5ad format + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["sc_rna_da_cells.R"] + +stdout: sc_rna_da_cells_stdout.log +stderr: sc_rna_da_cells_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-cell Differential Abundance Analysis" +s:name: "Single-cell Differential Abundance Analysis" +s:alternateName: "Detects cell subpopulations with differential abundance between datasets split by biological condition" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-rna-da-cells.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell Differential Abundance Analysis + + Detects cell subpopulations with differential abundance + between datasets split by biological condition. 
+ + +s:about: | + usage: sc_rna_da_cells.R + [-h] --query QUERY [--reduction REDUCTION] + [--dimensions [DIMENSIONS ...]] [--knn [KNN ...]] [--metadata METADATA] + --splitby SPLITBY --first FIRST --second SECOND + [--resolution [RESOLUTION ...]] [--ranges RANGES RANGES] [--pdf] + [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + + Single-cell Differential Abundance Analysis + + options: + -h, --help show this help message and exit + --query QUERY Path to the RDS file to load Seurat object from. This + file should include genes expression information + stored in the RNA assay and selected with the + --reduction parameter dimensionality reduction. + Additionally, 'rnaumap', and/or 'atacumap', and/or + 'wnnumap' dimensionality reductions should be present. + --reduction REDUCTION + Dimensionality reduction to be used for DA analysis. + Default: pca + --dimensions [DIMENSIONS ...] + Dimensionality to use when running DA analysis (from 1 + to 50). If single value N is provided, use from 1 to N + PCs. If multiple values are provided, subset to only + selected PCs. Default: from 1 to 10 + --knn [KNN ...] Array of k values for kNN graph construction when + calculating the score vector for each cell to + represent the DA behavior in the neighborhood. + Default: calculated based on the cells number + --metadata METADATA Path to the TSV/CSV file to optionally extend Seurat + object metadata with categorical values using samples + identities. First column - 'library_id' should + correspond to all unique values from the 'new.ident' + column of the loaded Seurat object. If any of the + provided in this file columns are already present in + the Seurat object metadata, they will be overwritten. + Default: no extra metadata is added + --splitby SPLITBY Column from the Seurat object metadata to split cells + into two groups to run --second vs --first DA + analysis. 
May include columns from the extra metadata + added with --metadata parameter. + --first FIRST Value from the Seurat object metadata column set with + --splitby to define the first group of cells for DA + analysis. + --second SECOND Value from the Seurat object metadata column set with + --splitby to define the second group of cells for DA + analysis. + --resolution [RESOLUTION ...] + Clustering resolution applied to DA cells to identify + DA cells populations. Can be set as an array. Default: + 0.01, 0.03, 0.05 + --ranges RANGES RANGES + DA scores ranges for to filter out not significant + cells. Default: calculated based on the permutation + test + --pdf Export plots in PDF. Default: false + --verbose Print debug information. Default: false + --h5seurat Save Seurat data to h5seurat file. Default: false + --h5ad Save Seurat data to h5ad file. Default: false + --cbbuild Export results to UCSC Cell Browser. Default: false + --output OUTPUT Output prefix. Default: ./sc + --theme {gray,bw,linedraw,light,dark,minimal,classic,void} + Color theme for all generated plots. Default: classic + --cpus CPUS Number of cores/cpus to use. Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple --cpus. Default: 32 \ No newline at end of file diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl new file mode 100644 index 00000000..9ff80079 --- /dev/null +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -0,0 +1,743 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: EnvVarRequirement + envDef: + R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-tools:v0.0.15 + + +inputs: + + query_data_rds: + type: File + inputBinding: + prefix: "--query" + doc: | + Path to the RDS file to load Seurat object from. 
This file should include genes + expression information stored in the RNA assay. Additionally, 'rnaumap', and/or + 'atacumap', and/or 'wnnumap' dimensionality reductions should be present. + + datasets_metadata: + type: File? + inputBinding: + prefix: "--metadata" + doc: | + Path to the TSV/CSV file to optionally extend Seurat object metadata with + categorical values using samples identities. First column - 'library_id' + should correspond to all unique values from the 'new.ident' column of the + loaded Seurat object. If any of the provided in this file columns are already + present in the Seurat object metadata, they will be overwritten. Default: no + extra metadata is added + + splitby: + type: string + inputBinding: + prefix: "--splitby" + doc: | + Column from the Seurat object metadata to split datasets into two groups + to run --second vs --first pseudobulk DE analysis, i.e., calculate log2FC. + May be one of the columns from the extra metadata added with --metadata + parameter. Provided value should group the datasets, not cells, therefore + do not use a column with clustering results. + + first_cond: + type: string + inputBinding: + prefix: "--first" + doc: | + Value from the Seurat object metadata column set with --splitby to define the + first group of datasets for pseudobulk DE analysis. + + second_cond: + type: string + inputBinding: + prefix: "--second" + doc: | + Value from the Seurat object metadata column set with --splitby to define the + second group of datasets for pseudobulk DE analysis. + + batchby: + type: string? + inputBinding: + prefix: "--batchby" + doc: | + Column from the Seurat object metadata to group datasets into batches. It will be used + as a factor variable to model batch effect when running pseudobulk DE analysis (makes + design formula look like ~splitby+batchby). May be one of the columns from the extra + metadata added with --metadata parameter. 
Provided value should batch the datasets, not + cells, therefore do not use a column with clustering results. Default: do not model + batch effect. + + groupby: + type: string? + inputBinding: + prefix: "--groupby" + doc: | + Column from the Seurat object metadata to group cells for optional subsetting + when combined with --subset parameter. May be one of the columns from the extra + metadata added with --metadata parameter. Ignored if --subset is not set. Provided + value defines the groups of cells, therefore any metadata column, including the + clustering results, may be used. Default: do not subset, run pseudobulk DE analysis + for all cells jointly + + subset: + type: + - "null" + - string + - string[] + inputBinding: + prefix: "--subset" + doc: | + Value(s) from the column set with --groupby parameter to subset cells + before running pseudobulk DE analysis. If multiple values are provided + run analysis jointly for selected groups of cells. Ignored if --groupby + is not set. Default: do not subset, run pseudobulk DE analysis for all + cells jointly + + lrt: + type: boolean? + inputBinding: + prefix: "--lrt" + doc: | + Use LRT instead of the pair-wise Wald test. If --batchby is not provided + use ~1 as a reduced formula, otherwise ~batchby. Default: use Wald test + + maximum_padj: + type: float? + inputBinding: + prefix: "--padj" + doc: | + In the exploratory visualization part of the analysis output only features + with adjusted P-value not bigger than this value. Default: 0.05 + + genes_of_interest: + type: + - "null" + - string + - string[] + inputBinding: + prefix: "--genes" + doc: | + Genes of interest to label on the generated plots. Default: top 10 genes + with the highest and the lowest log2FC expression values. + + exclude_pattern: + type: string? + inputBinding: + prefix: "--exclude" + doc: | + Regex pattern to identify and exclude non-coding RNA genes from the pseudobulk + DE analysis (not case-sensitive). 
If any of such genes were provided in the --genes + parameter, they will be excluded from there as well. + Default: use all genes + + normalization_method: + type: + - "null" + - type: enum + symbols: + - "vst" + - "rlog" + inputBinding: + prefix: "--norm" + doc: | + Read counts normalization for the exploratory visualization part of the analysis. + Use 'vst' for medium-to-large datasets (n > 30) and 'rlog' for small datasets + (n < 30), when there is a wide range of sequencing depth across samples. + Default: rlog + + remove: + type: boolean? + inputBinding: + prefix: "--remove" + doc: | + Remove batch effect when generating normalized read counts for the exploratory + visualization part of the analysis. Ignored if --batchby is not provided. + Default: do not remove batch effect from normalized read counts. + + cluster_method: + type: + - "null" + - type: enum + symbols: + - "row" + - "column" + - "both" + inputBinding: + prefix: "--cluster" + doc: | + Hopach clustering method to be run on normalized read counts for the + exploratory visualization part of the analysis. Default: do not run + clustering + + row_distance: + type: + - "null" + - type: enum + symbols: + - "cosangle" + - "abscosangle" + - "euclid" + - "abseuclid" + - "cor" + - "abscor" + inputBinding: + prefix: "--rowdist" + doc: | + Distance metric for HOPACH row clustering. Ignored if --cluster is set + to column or not provided. Default: cosangle + + column_distance: + type: + - "null" + - type: enum + symbols: + - "cosangle" + - "abscosangle" + - "euclid" + - "abseuclid" + - "cor" + - "abscor" + inputBinding: + prefix: "--columndist" + doc: | + Distance metric for HOPACH column clustering. Ignored if --cluster is set + to row or not provided. Default: euclid + + center_row: + type: boolean? + inputBinding: + prefix: "--center" + doc: | + Apply mean centering for gene expression prior to running + clustering by row. Ignored if --cluster is set to column or + not provided. 
Default: do not center + + export_pdf_plots: + type: boolean? + inputBinding: + prefix: "--pdf" + doc: | + Export plots in PDF. + Default: false + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + inputBinding: + prefix: "--theme" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + + verbose: + type: boolean? + inputBinding: + prefix: "--verbose" + doc: | + Print debug information. + Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix. + Default: ./sc + + parallel_memory_limit: + type: int? + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Default: 32 + + vector_memory_limit: + type: int? + default: 128 + doc: | + Maximum vector memory in GB allowed to be used by R. + Default: 128 + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. + Default: 1 + + +outputs: + + umap_rd_rnaumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_rd_rnaumap.png" + doc: | + Cells UMAP split by selected biological condition, optionally + subsetted to the specific cluster or cell type (rnaumap dim. + reduction). + PNG format + + umap_rd_rnaumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_rd_rnaumap.pdf" + doc: | + Cells UMAP split by selected biological condition, optionally + subsetted to the specific cluster or cell type (rnaumap dim. + reduction). + PDF format + + umap_rd_atacumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_rd_atacumap.png" + doc: | + Cells UMAP split by selected biological condition, optionally + subsetted to the specific cluster or cell type (atacumap dim. + reduction). + PNG format + + umap_rd_atacumap_plot_pdf: + type: File?
+ outputBinding: + glob: "*_umap_rd_atacumap.pdf" + doc: | + Cells UMAP split by selected biological condition, optionally + subsetted to the specific cluster or cell type (atacumap dim. + reduction). + PDF format + + umap_rd_wnnumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_rd_wnnumap.png" + doc: | + Cells UMAP split by selected biological condition, optionally + subsetted to the specific cluster or cell type (wnnumap dim. + reduction). + PNG format + + umap_rd_wnnumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_rd_wnnumap.pdf" + doc: | + Cells UMAP split by selected biological condition, optionally + subsetted to the specific cluster or cell type (wnnumap dim. + reduction). + PDF format + + mds_plot_html: + type: File? + outputBinding: + glob: "*_mds_plot.html" + doc: | + MDS plot of normalized counts. Optionally batch corrected + if --remove was set to True. + HTML format + + pca_1_2_plot_png: + type: File? + outputBinding: + glob: "*_pca_1_2.png" + doc: | + Normalized counts PCA (PC1 and PC2) subsetted to all DE genes regardless + of Padj, optionally batch corrected by the selected criteria. + PNG format + + pca_1_2_plot_pdf: + type: File? + outputBinding: + glob: "*_pca_1_2.pdf" + doc: | + Normalized counts PCA (PC1 and PC2) subsetted to all DE genes regardless + of Padj, optionally batch corrected by the selected criteria. + PDF format + + pca_2_3_plot_png: + type: File? + outputBinding: + glob: "*_pca_2_3.png" + doc: | + Normalized counts PCA (PC2 and PC3) subsetted to all DE genes regardless + of Padj, optionally batch corrected by the selected criteria. + PNG format + + pca_2_3_plot_pdf: + type: File? + outputBinding: + glob: "*_pca_2_3.pdf" + doc: | + Normalized counts PCA (PC2 and PC3) subsetted to all DE genes regardless + of Padj, optionally batch corrected by the selected criteria. + PDF format + + dxpr_vlcn_plot_png: + type: File? 
+ outputBinding: + glob: "*_dxpr_vlcn.png" + doc: | + Volcano plot of differentially expressed genes. Highlighted genes are either + provided by user or top 10 genes with the highest log2FC values. The direction + of comparison is defined by --second vs --first groups of cells optionally + subsetted to the specific cluster or cell type and coerced to the pseudobulk + RNA-Seq samples. + PNG format + + dxpr_vlcn_plot_pdf: + type: File? + outputBinding: + glob: "*_dxpr_vlcn.pdf" + doc: | + Volcano plot of differentially expressed genes. Highlighted genes are either + provided by user or top 10 genes with the highest log2FC values. The direction + of comparison is defined by --second vs --first groups of cells optionally + subsetted to the specific cluster or cell type and coerced to the pseudobulk + RNA-Seq samples. + PDF format + + xpr_dnst_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_dnst_*.png" + doc: | + Log normalized gene expression density per dataset optionally subsetted to the + specific cluster or cell type. + PNG format + + xpr_dnst_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_dnst_*.pdf" + doc: | + Log normalized gene expression density per dataset optionally subsetted to the + specific cluster or cell type. + PDF format + + xpr_per_cell_rd_rnaumap_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_rd_rnaumap_*.png" + doc: | + Log normalized gene expression on cells UMAP per dataset optionally subsetted + to the specific cluster or cell type (rnaumap dim. reduction). + PNG format + + xpr_per_cell_rd_rnaumap_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_rd_rnaumap_*.pdf" + doc: | + Log normalized gene expression on cells UMAP per dataset optionally subsetted + to the specific cluster or cell type (rnaumap dim. reduction).
+ PDF format + + xpr_per_cell_rd_atacumap_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_rd_atacumap_*.png" + doc: | + Log normalized gene expression on cells UMAP per dataset optionally subsetted + to the specific cluster or cell type (atacumap dim. reduction). + PNG format + + xpr_per_cell_rd_atacumap_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_rd_atacumap_*.pdf" + doc: | + Log normalized gene expression on cells UMAP per dataset optionally subsetted + to the specific cluster or cell type (atacumap dim. reduction). + PDF format + + xpr_per_cell_rd_wnnumap_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_rd_wnnumap_*.png" + doc: | + Log normalized gene expression on cells UMAP per dataset optionally subsetted + to the specific cluster or cell type (wnnumap dim. reduction). + PNG format + + xpr_per_cell_rd_wnnumap_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_rd_wnnumap_*.pdf" + doc: | + Log normalized gene expression on cells UMAP per dataset optionally subsetted + to the specific cluster or cell type (wnnumap dim. reduction). + PDF format + + xpr_htmp_plot_png: + type: File? + outputBinding: + glob: "*_xpr_htmp.png" + doc: | + Normalized gene expression heatmap optionally subsetted + to the specific cluster or cell type. + PNG format + + xpr_htmp_plot_pdf: + type: File? + outputBinding: + glob: "*_xpr_htmp.pdf" + doc: | + Normalized gene expression heatmap optionally subsetted + to the specific cluster or cell type. + PDF format + + diff_expr_genes: + type: File? + outputBinding: + glob: "*_de_genes.tsv" + doc: | + Differentially expressed genes. + TSV format + + read_counts_gct: + type: File? + outputBinding: + glob: "*_norm_read_counts.gct" + doc: | + GSEA compatible normalized counts, optionally, batch corrected. 
+ GCT format + + phenotypes_cls: + type: File? + outputBinding: + glob: "*_phenotypes.cls" + doc: | + GSEA compatible phenotypes file defined based on --splitby, --first, + and --second parameters. + CLS format + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["sc_rna_de_pseudobulk.R"] + +stdout: sc_rna_de_pseudobulk_stdout.log +stderr: sc_rna_de_pseudobulk_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-cell Pseudobulk Differential Expression Analysis Between Datasets" +s:name: "Single-cell Pseudobulk Differential Expression Analysis Between Datasets" +s:alternateName: "Identifies differentially expressed genes between groups of cells coerced to pseudobulk datasets" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-rna-de-pseudobulk.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell Pseudobulk Differential Expression Analysis 
Between Datasets + + Identifies differentially expressed genes between groups + of cells coerced to pseudobulk datasets. + + +s:about: | + usage: sc_rna_de_pseudobulk.R + [-h] --query QUERY [--metadata METADATA] --splitby SPLITBY --first + FIRST --second SECOND [--batchby BATCHBY] [--groupby GROUPBY] + [--subset [SUBSET ...]] [--lrt] [--padj PADJ] [--genes [GENES ...]] + [--exclude EXCLUDE] [--norm {vst,rlog}] [--remove] + [--cluster {row,column,both}] + [--rowdist {cosangle,abscosangle,euclid,abseuclid,cor,abscor}] + [--columndist {cosangle,abscosangle,euclid,abseuclid,cor,abscor}] + [--center] [--pdf] [--verbose] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + + Single-cell Pseudobulk Differential Expression Analysis Between Datasets + + options: + -h, --help show this help message and exit + --query QUERY Path to the RDS file to load Seurat object from. This + file should include genes expression information + stored in the RNA assay. Additionally, 'rnaumap', + and/or 'atacumap', and/or 'wnnumap' dimensionality + reductions should be present. + --metadata METADATA Path to the TSV/CSV file to optionally extend Seurat + object metadata with categorical values using samples + identities. First column - 'library_id' should + correspond to all unique values from the 'new.ident' + column of the loaded Seurat object. If any of the + provided in this file columns are already present in + the Seurat object metadata, they will be overwritten. + Default: no extra metadata is added + --splitby SPLITBY Column from the Seurat object metadata to split + datasets into two groups to run --second vs --first + pseudobulk DE analysis, i.e., calculate log2FC. May be + one of the columns from the extra metadata added with + --metadata parameter. Provided value should group the + datasets, not cells, therefore do not use a column + with clustering results. 
+ --first FIRST Value from the Seurat object metadata column set with + --splitby to define the first group of datasets for + pseudobulk DE analysis. + --second SECOND Value from the Seurat object metadata column set with + --splitby to define the second group of datasets for + pseudobulk DE analysis. + --batchby BATCHBY Column from the Seurat object metadata to group + datasets into batches. It will be used as a factor + variable to model batch effect when running pseudobulk + DE analysis (makes design formula look like + ~splitby+batchby). May be one of the columns from the + extra metadata added with --metadata parameter. + Provided value should batch the datasets, not cells, + therefore do not use a column with clustering results. + Default: do not model batch effect. + --groupby GROUPBY Column from the Seurat object metadata to group cells + for optional subsetting when combined with --subset + parameter. May be one of the columns from the extra + metadata added with --metadata parameter. Ignored if + --subset is not set. Provided value defines the groups + of cells, therefore any metadata column, including the + clustering results, may be used. Default: do not + subset, run pseudobulk DE analysis for all cells + jointly + --subset [SUBSET ...] + Value(s) from the column set with --groupby parameter + to subset cells before running pseudobulk DE analysis. + If multiple values are provided run analysis jointly + for selected groups of cells. Ignored if --groupby is + not set. Default: do not subset, run pseudobulk DE + analysis for all cells jointly + --lrt Use LRT instead of the pair-wise Wald test. If + --batchby is not provided use ~1 as a reduced formula, + otherwise ~batchby. Default: use Wald test + --padj PADJ In the exploratory visualization part of the analysis + output only features with adjusted P-value not bigger + than this value. Default: 0.05 + --genes [GENES ...] Genes of interest to label on the generated plots. 
+ Default: top 10 genes with the highest and the lowest + log2FC expression values. + --exclude EXCLUDE Regex pattern to identify and exclude non-coding RNA + genes from the pseudobulk DE analysis (not case- + sensitive). If any of such genes were provided in the + --genes parameter, they will be excluded from there as + well. Default: use all genes + --norm {vst,rlog} Read counts normalization for the exploratory + visualization part of the analysis. Use 'vst' for + medium-to-large datasets (n > 30) and 'rlog' for small + datasets (n < 30), when there is a wide range of + sequencing depth across samples. Default: rlog + --remove Remove batch effect when generating normalized read + counts for the exploratory visualization part of the + analysis. Ignored if --batchby is not provided. + Default: do not remove batch effect from normalized + read counts. + --cluster {row,column,both} + Hopach clustering method to be run on normalized read + counts for the exploratory visualization part of the + analysis. Default: do not run clustering + --rowdist {cosangle,abscosangle,euclid,abseuclid,cor,abscor} + Distance metric for HOPACH row clustering. Ignored if + --cluster is set to column or not provided. Default: + cosangle + --columndist {cosangle,abscosangle,euclid,abseuclid,cor,abscor} + Distance metric for HOPACH column clustering. Ignored + if --cluster is set to row or not provided. Default: + euclid + --center Apply mean centering for gene expression prior to + running clustering by row. Ignored if --cluster is set + to column or not provided. Default: do not centered + --pdf Export plots in PDF. Default: false + --verbose Print debug information. Default: false + --output OUTPUT Output prefix. Default: ./sc + --theme {gray,bw,linedraw,light,dark,minimal,classic,void} + Color theme for all generated plots. Default: classic + --cpus CPUS Number of cores/cpus to use. 
Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple --cpus. Default: 32 \ No newline at end of file diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl new file mode 100644 index 00000000..7f0d0281 --- /dev/null +++ b/tools/sc-rna-filter.cwl @@ -0,0 +1,864 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: InitialWorkDirRequirement + listing: + - entryname: dummy_metadata.csv + entry: | + library_id + Experiment +- class: EnvVarRequirement + envDef: + R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-tools:v0.0.15 + + +inputs: + + feature_bc_matrices_folder: + type: Directory + inputBinding: + prefix: "--mex" + doc: | + Path to the folder with feature-barcode matrix from Cell Ranger Count/Aggregate + experiment in MEX format. + + aggregation_metadata: + type: File? + doc: | + Path to the metadata TSV/CSV file to set the datasets identities. If '--mex' points to + the Cell Ranger Aggregate outputs, the aggregation.csv file can be used. If input is not + provided, the default dummy_metadata.csv will be used instead. + + grouping_data: + type: File? + inputBinding: + prefix: "--grouping" + doc: | + Path to the TSV/CSV file to define datasets grouping. + First column - 'library_id' with the values and order + that correspond to the 'library_id' column from the + '--identity' file, second column 'condition'. + Default: each dataset is assigned to its own group. + + barcodes_data: + type: File? + inputBinding: + prefix: "--barcodes" + doc: | + Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata overwriting the existing ones if + those are present.
+ Default: all cells used, no extra metadata is added + + rna_minimum_cells: + type: int? + inputBinding: + prefix: "--rnamincells" + doc: | + Include only genes detected in at least this many cells. + Default: 5 (applied to all datasets) + + minimum_genes: + type: + - "null" + - int + - int[] + inputBinding: + prefix: "--mingenes" + doc: | + Include cells where at least this many genes are detected. If multiple values + provided, each of them will be applied to the corresponding dataset from the + '--mex' input based on the '--identity' file. + Default: 250 (applied to all datasets) + + maximum_genes: + type: + - "null" + - int + - int[] + inputBinding: + prefix: "--maxgenes" + doc: | + Include cells with the number of genes not bigger than this value. If multiple + values provided, each of them will be applied to the corresponding dataset from + the '--mex' input based on the '--identity' file. + Default: 5000 (applied to all datasets) + + rna_minimum_umi: + type: + - "null" + - int + - int[] + inputBinding: + prefix: "--rnaminumi" + doc: | + Include cells where at least this many UMI (transcripts) are detected. + If multiple values provided, each of them will be applied to the corresponding + dataset from the '--mex' input based on the '--identity' file. + Default: 500 (applied to all datasets) + + minimum_novelty_score: + type: + - "null" + - float + - float[] + inputBinding: + prefix: "--minnovelty" + doc: | + Include cells with the novelty score not lower than this value, calculated + as log10(genes)/log10(UMI). If multiple values provided, each of them will + be applied to the corresponding dataset from the '--mex' input based on the + '--identity' file. + Default: 0.8 (applied to all datasets) + + mito_pattern: + type: string? + inputBinding: + prefix: "--mitopattern" + doc: | + Regex pattern to identify mitochondrial genes. + Default: '^mt-|^MT-' + + maximum_mito_perc: + type: float?
+ inputBinding: + prefix: "--maxmt" + doc: | + Include cells with the percentage of transcripts mapped to mitochondrial + genes not bigger than this value. + Default: 5 (applied to all datasets) + + export_pdf_plots: + type: boolean? + inputBinding: + prefix: "--pdf" + doc: | + Export plots in PDF. + Default: false + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + inputBinding: + prefix: "--theme" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + + verbose: + type: boolean? + inputBinding: + prefix: "--verbose" + doc: | + Print debug information. + Default: false + + export_h5seurat_data: + type: boolean? + inputBinding: + prefix: "--h5seurat" + doc: | + Save Seurat data to h5seurat file. + Default: false + + export_h5ad_data: + type: boolean? + inputBinding: + prefix: "--h5ad" + doc: | + Save Seurat data to h5ad file. + Default: false + + export_ucsc_cb: + type: boolean? + inputBinding: + prefix: "--cbbuild" + doc: | + Export results to UCSC Cell Browser. Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix. + Default: ./sc + + parallel_memory_limit: + type: int? + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Default: 32 + + vector_memory_limit: + type: int? + default: 128 + doc: | + Maximum vector memory in GB allowed to be used by R. + Default: 128 + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. + Default: 1 + + +outputs: + + raw_1_2_qc_mtrcs_pca_plot_png: + type: File? + outputBinding: + glob: "*_raw_1_2_qc_mtrcs_pca.png" + doc: | + PC1 and PC2 from the QC metrics PCA (not filtered). + PNG format + + raw_1_2_qc_mtrcs_pca_plot_pdf: + type: File? 
+ outputBinding: + glob: "*_raw_1_2_qc_mtrcs_pca.pdf" + doc: | + PC1 and PC2 from the QC metrics PCA (not filtered). + PDF format + + raw_2_3_qc_mtrcs_pca_plot_png: + type: File? + outputBinding: + glob: "*_raw_2_3_qc_mtrcs_pca.png" + doc: | + PC2 and PC3 from the QC metrics PCA (not filtered). + PNG format + + raw_2_3_qc_mtrcs_pca_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_2_3_qc_mtrcs_pca.pdf" + doc: | + PC2 and PC3 from the QC metrics PCA (not filtered). + PDF format + + raw_cells_count_plot_png: + type: File? + outputBinding: + glob: "*_raw_cells_count.png" + doc: | + Number of cells per dataset (not filtered). + PNG format + + raw_cells_count_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_cells_count.pdf" + doc: | + Number of cells per dataset (not filtered). + PDF format + + raw_umi_dnst_plot_png: + type: File? + outputBinding: + glob: "*_raw_umi_dnst.png" + doc: | + UMI per cell density (not filtered). + PNG format + + raw_umi_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_umi_dnst.pdf" + doc: | + UMI per cell density (not filtered). + PDF format + + raw_gene_dnst_plot_png: + type: File? + outputBinding: + glob: "*_raw_gene_dnst.png" + doc: | + Genes per cell density (not filtered). + PNG format + + raw_gene_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_gene_dnst.pdf" + doc: | + Genes per cell density (not filtered). + PDF format + + raw_gene_umi_corr_plot_png: + type: File? + outputBinding: + glob: "*_raw_gene_umi_corr.png" + doc: | + Genes vs UMI per cell correlation (not filtered). + PNG format + + raw_gene_umi_corr_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_gene_umi_corr.pdf" + doc: | + Genes vs UMI per cell correlation (not filtered). + PDF format + + raw_mito_dnst_plot_png: + type: File? + outputBinding: + glob: "*_raw_mito_dnst.png" + doc: | + Percentage of transcripts mapped to mitochondrial genes per cell density (not filtered). + PNG format + + raw_mito_dnst_plot_pdf: + type: File? 
+ outputBinding: + glob: "*_raw_mito_dnst.pdf" + doc: | + Percentage of transcripts mapped to mitochondrial genes per cell density (not filtered). + PDF format + + raw_nvlt_dnst_plot_png: + type: File? + outputBinding: + glob: "*_raw_nvlt_dnst.png" + doc: | + Novelty score per cell density (not filtered). + PNG format + + raw_nvlt_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_nvlt_dnst.pdf" + doc: | + Novelty score per cell density (not filtered). + PDF format + + raw_qc_mtrcs_dnst_plot_png: + type: File? + outputBinding: + glob: "*_raw_qc_mtrcs_dnst.png" + doc: | + QC metrics per cell density (not filtered). + PNG format + + raw_qc_mtrcs_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_qc_mtrcs_dnst.pdf" + doc: | + QC metrics per cell density (not filtered). + PDF format + + raw_umi_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_raw_umi_dnst_spl_cnd.png" + doc: | + Split by grouping condition UMI per cell density (not filtered). + PNG format + + raw_umi_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_umi_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition UMI per cell density (not filtered). + PDF format + + raw_gene_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_raw_gene_dnst_spl_cnd.png" + doc: | + Split by grouping condition genes per cell density (not filtered). + PNG format + + raw_gene_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_gene_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition genes per cell density (not filtered). + PDF format + + raw_mito_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_raw_mito_dnst_spl_cnd.png" + doc: | + Split by grouping condition the percentage of transcripts mapped + to mitochondrial genes per cell density (not filtered). + PNG format + + raw_mito_dnst_spl_cnd_plot_pdf: + type: File? 
+ outputBinding: + glob: "*_raw_mito_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition the percentage of transcripts mapped + to mitochondrial genes per cell density (not filtered). + PDF format + + raw_nvlt_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_raw_nvlt_dnst_spl_cnd.png" + doc: | + Split by grouping condition the novelty score per cell density (not filtered). + PNG format + + raw_nvlt_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_nvlt_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition the novelty score per cell density (not filtered). + PDF format + + fltr_1_2_qc_mtrcs_pca_plot_png: + type: File? + outputBinding: + glob: "*_fltr_1_2_qc_mtrcs_pca.png" + doc: | + PC1 and PC2 from the QC metrics PCA (filtered). + PNG format + + fltr_1_2_qc_mtrcs_pca_plot_pdf: + type: File? + outputBinding: + glob: "*_fltr_1_2_qc_mtrcs_pca.pdf" + doc: | + PC1 and PC2 from the QC metrics PCA (filtered). + PDF format + + fltr_2_3_qc_mtrcs_pca_plot_png: + type: File? + outputBinding: + glob: "*_fltr_2_3_qc_mtrcs_pca.png" + doc: | + PC2 and PC3 from the QC metrics PCA (filtered). + PNG format + + fltr_2_3_qc_mtrcs_pca_plot_pdf: + type: File? + outputBinding: + glob: "*_fltr_2_3_qc_mtrcs_pca.pdf" + doc: | + PC2 and PC3 from the QC metrics PCA (filtered). + PDF format + + fltr_cells_count_plot_png: + type: File? + outputBinding: + glob: "*_fltr_cells_count.png" + doc: | + Number of cells per dataset (filtered). + PNG format + + fltr_cells_count_plot_pdf: + type: File? + outputBinding: + glob: "*_fltr_cells_count.pdf" + doc: | + Number of cells per dataset (filtered). + PDF format + + fltr_umi_dnst_plot_png: + type: File? + outputBinding: + glob: "*_fltr_umi_dnst.png" + doc: | + UMI per cell density (filtered). + PNG format + + fltr_umi_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_fltr_umi_dnst.pdf" + doc: | + UMI per cell density (filtered). + PDF format + + fltr_gene_dnst_plot_png: + type: File? 
+ outputBinding: + glob: "*_fltr_gene_dnst.png" + doc: | + Genes per cell density (filtered). + PNG format + + fltr_gene_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_fltr_gene_dnst.pdf" + doc: | + Genes per cell density (filtered). + PDF format + + fltr_gene_umi_corr_plot_png: + type: File? + outputBinding: + glob: "*_fltr_gene_umi_corr.png" + doc: | + Genes vs UMI per cell correlation (filtered). + PNG format + + fltr_gene_umi_corr_plot_pdf: + type: File? + outputBinding: + glob: "*_fltr_gene_umi_corr.pdf" + doc: | + Genes vs UMI per cell correlation (filtered). + PDF format + + fltr_mito_dnst_plot_png: + type: File? + outputBinding: + glob: "*_fltr_mito_dnst.png" + doc: | + Percentage of transcripts mapped to mitochondrial genes per cell density (filtered). + PNG format + + fltr_mito_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_fltr_mito_dnst.pdf" + doc: | + Percentage of transcripts mapped to mitochondrial genes per cell density (filtered). + PDF format + + fltr_nvlt_dnst_plot_png: + type: File? + outputBinding: + glob: "*_fltr_nvlt_dnst.png" + doc: | + Novelty score per cell density (filtered). + PNG format + + fltr_nvlt_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_fltr_nvlt_dnst.pdf" + doc: | + Novelty score per cell density (filtered). + PDF format + + fltr_qc_mtrcs_dnst_plot_png: + type: File? + outputBinding: + glob: "*_fltr_qc_mtrcs_dnst.png" + doc: | + QC metrics per cell density (filtered). + PNG format + + fltr_qc_mtrcs_dnst_plot_pdf: + type: File? + outputBinding: + glob: "*_fltr_qc_mtrcs_dnst.pdf" + doc: | + QC metrics per cell density (filtered). + PDF format + + fltr_umi_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_fltr_umi_dnst_spl_cnd.png" + doc: | + Split by grouping condition UMI per cell density (filtered). + PNG format + + fltr_umi_dnst_spl_cnd_plot_pdf: + type: File? 
+ outputBinding: + glob: "*_fltr_umi_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition UMI per cell density (filtered). + PDF format + + fltr_gene_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_fltr_gene_dnst_spl_cnd.png" + doc: | + Split by grouping condition genes per cell density (filtered). + PNG format + + fltr_gene_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_fltr_gene_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition genes per cell density (filtered). + PDF format + + fltr_mito_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_fltr_mito_dnst_spl_cnd.png" + doc: | + Split by grouping condition the percentage of transcripts mapped + to mitochondrial genes per cell density (filtered). + PNG format + + fltr_mito_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_fltr_mito_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition the percentage of transcripts mapped + to mitochondrial genes per cell density (filtered). + PDF format + + fltr_nvlt_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_fltr_nvlt_dnst_spl_cnd.png" + doc: | + Split by grouping condition the novelty score per cell density (filtered). + PNG format + + fltr_nvlt_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_fltr_nvlt_dnst_spl_cnd.pdf" + doc: | + Split by grouping condition the novelty score per cell density (filtered). + PDF format + + ucsc_cb_config_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser" + doc: | + Directory with UCSC Cellbrowser configuration data. + + ucsc_cb_html_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser/html_data" + doc: | + Directory with UCSC Cellbrowser html data. + + ucsc_cb_html_file: + type: File? + outputBinding: + glob: "*_cellbrowser/html_data/index.html" + doc: | + HTML index file from the directory with UCSC Cellbrowser html data. 
+ + seurat_data_rds: + type: File + outputBinding: + glob: "*_data.rds" + doc: | + Filtered Seurat data in RDS format + + seurat_data_h5seurat: + type: File? + outputBinding: + glob: "*_data.h5seurat" + doc: | + Filtered Seurat data in h5seurat format + + seurat_data_h5ad: + type: File? + outputBinding: + glob: "*_data.h5ad" + doc: | + Filtered Seurat data in h5ad format + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["sc_rna_filter.R"] +arguments: +- valueFrom: | + ${ + if (inputs.aggregation_metadata) { + return inputs.aggregation_metadata; + } else { + return runtime.outdir + "/dummy_metadata.csv" + } + } + prefix: "--identity" + + +stdout: sc_rna_filter_stdout.log +stderr: sc_rna_filter_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-cell RNA-Seq Filtering Analysis" +s:name: "Single-cell RNA-Seq Filtering Analysis" +s:alternateName: "Filters single-cell RNA-Seq datasets based on the common QC metrics" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-rna-filter.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski
Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell RNA-Seq Filtering Analysis + + Filters single-cell RNA-Seq datasets based on the common QC metrics. + + +s:about: | + usage: sc_rna_filter.R + [-h] --mex MEX [MEX ...] --identity IDENTITY [--grouping GROUPING] + [--barcodes BARCODES] [--rnamincells RNAMINCELLS] + [--mingenes [MINGENES ...]] [--maxgenes [MAXGENES ...]] + [--rnaminumi [RNAMINUMI ...]] [--minnovelty [MINNOVELTY ...]] + [--mitopattern MITOPATTERN] [--maxmt MAXMT] [--pdf] [--verbose] + [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + + Single-cell RNA-Seq Filtering Analysis + + options: + -h, --help show this help message and exit + --mex MEX [MEX ...] Path to the folder with feature-barcode matrix from + Cell Ranger Count/Aggregate experiment in MEX format. + If multiple locations provided data is assumed to be + not aggregated (outputs from the multiple Cell Ranger + Count experiments) and will be merged before the + analysis. + --identity IDENTITY Path to the metadata TSV/CSV file to set the datasets + identities. If '--mex' points to the Cell Ranger + Aggregate outputs, the aggregation.csv file can be + used. In case of using feature-barcode matrices from a + single or multiple Cell Ranger Count experiments the + file with identities should include at least one + column - 'library_id', and a row with aliases per each + experiment from the '--mex' input. The order of rows + should correspond to the order of feature-barcode + matrices provided in the '--mex' parameter. + --grouping GROUPING Path to the TSV/CSV file to define datasets grouping. + First column - 'library_id' with the values and order + that correspond to the 'library_id' column from the ' + --identity' file, second column 'condition'. 
Default: + each dataset is assigned to its own group. + --barcodes BARCODES Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata be selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata ovewriting the existing ones if + those are present. Default: all cells used, no extra + metadata is added + --rnamincells RNAMINCELLS + Include only genes detected in at least this many + cells. Ignored when '--mex' points to the feature- + barcode matrices from the multiple Cell Ranger Count + experiments. Default: 5 (applied to all datasets) + --mingenes [MINGENES ...] + Include cells where at least this many genes are + detected. If multiple values provided, each of them + will be applied to the correspondent dataset from the + '--mex' input based on the '--identity' file. Default: + 250 (applied to all datasets) + --maxgenes [MAXGENES ...] + Include cells with the number of genes not bigger than + this value. If multiple values provided, each of them + will be applied to the correspondent dataset from the + '--mex' input based on the '--identity' file. Default: + 5000 (applied to all datasets) + --rnaminumi [RNAMINUMI ...] + Include cells where at least this many UMI + (transcripts) are detected. If multiple values + provided, each of them will be applied to the + correspondent dataset from the '--mex' input based on + the '--identity' file. Default: 500 (applied to all + datasets) + --minnovelty [MINNOVELTY ...] + Include cells with the novelty score not lower than + this value, calculated for as log10(genes)/log10(UMI). + If multiple values provided, each of them will be + applied to the correspondent dataset from the '--mex' + input based on the '--identity' file. Default: 0.8 + (applied to all datasets) + --mitopattern MITOPATTERN + Regex pattern to identify mitochondrial genes. 
+ Default: '^mt-|^MT-' + --maxmt MAXMT Include cells with the percentage of transcripts + mapped to mitochondrial genes not bigger than this + value. Default: 5 (applied to all datasets) + --pdf Export plots in PDF. Default: false + --verbose Print debug information. Default: false + --h5seurat Save Seurat data to h5seurat file. Default: false + --h5ad Save Seurat data to h5ad file. Default: false + --cbbuild Export results to UCSC Cell Browser. Default: false + --output OUTPUT Output prefix. Default: ./sc + --theme {gray,bw,linedraw,light,dark,minimal,classic,void} + Color theme for all generated plots. Default: classic + --cpus CPUS Number of cores/cpus to use. Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple '--cpus'. Default: 32 \ No newline at end of file diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl new file mode 100644 index 00000000..573c900b --- /dev/null +++ b/tools/sc-rna-reduce.cwl @@ -0,0 +1,862 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: EnvVarRequirement + envDef: + R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-tools:v0.0.15 + + +inputs: + + query_data_rds: + type: File + inputBinding: + prefix: "--query" + doc: | + Path to the RDS file to load Seurat object from. This file should include genes + expression information stored in the RNA assay. + + datasets_metadata: + type: File? + inputBinding: + prefix: "--metadata" + doc: | + Path to the TSV/CSV file to optionally extend Seurat object metadata with + categorical values using samples identities. First column - 'library_id' + should correspond to all unique values from the 'new.ident' column of the + loaded Seurat object. If any of the provided in this file columns are already + present in the Seurat object metadata, they will be overwritten. 
When combined + with --barcodes parameter, first the metadata will be extended, then barcode + filtering will be applied. + Default: no extra metadata is added + + barcodes_data: + type: File? + inputBinding: + prefix: "--barcodes" + doc: | + Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata overwriting the existing ones if + those are present. + Default: all cells used, no extra metadata is added + + cell_cycle_data: + type: File? + inputBinding: + prefix: "--cellcycle" + doc: | + Path to the TSV/CSV file with the information for cell cycle score assignment. + First column - 'phase', second column 'gene_id'. If loaded Seurat object already + includes cell cycle scores in 'S.Score', 'G2M.Score', and 'CC.Difference' metadata + columns they will be overwritten. + Default: skip cell cycle score assignment. + + normalization_method: + type: + - "null" + - type: enum + symbols: + - "sct" + - "log" + - "sctglm" + inputBinding: + prefix: "--norm" + doc: | + Normalization method applied to genes expression counts. If loaded Seurat object + includes multiple datasets, normalization will be run independently for each of + them, unless integration is disabled with 'none' or set to 'harmony' + Default: sct + + integration_method: + type: + - "null" + - type: enum + symbols: + - "seurat" + - "harmony" + - "none" + inputBinding: + prefix: "--ntgr" + doc: | + Integration method used for joint analysis of multiple datasets. Automatically + set to 'none' if loaded Seurat object includes only one dataset. + Default: seurat + + integrate_by: + type: + - "null" + - string + - string[] + inputBinding: + prefix: "--ntgrby" + doc: | + Column(s) from the Seurat object metadata to define the variable(s) that should + be integrated out when running multiple datasets integration with harmony.
May + include columns from the extra metadata added with --metadata parameter. Ignored + if --ntgr is not set to harmony. + Default: new.ident + + highly_var_genes_count: + type: int? + inputBinding: + prefix: "--highvargenes" + doc: | + Number of highly variable genes used in datasets integration, scaling and + dimensionality reduction. + Default: 3000 + + regress_mito_perc: + type: boolean? + inputBinding: + prefix: "--regressmt" + doc: | + Regress the percentage of transcripts mapped to mitochondrial genes as a + confounding source of variation. + Default: false + + regress_genes: + type: + - "null" + - string + - string[] + inputBinding: + prefix: "--regressgenes" + doc: | + Genes whose expression should be regressed as a confounding source of variation. + Default: None + + regress_ccycle_full: + type: boolean? + inputBinding: + prefix: "--regressccfull" + doc: | + Regress all signals associated with cell cycle phase. + Ignored if --cellcycle is not provided. Mutually exclusive + with --regressccdiff parameter. + Default: false + + regress_ccycle_diff: + type: boolean? + inputBinding: + prefix: "--regressccdiff" + doc: | + Regress only differences in cell cycle phase among proliferating + cells. Signals separating non-cycling and cycling cells will be + maintained. Ignored if --cellcycle is not provided. Mutually + exclusive with --regressccfull + Default: false + + dimensions: + type: + - "null" + - int + - int[] + inputBinding: + prefix: "--dimensions" + doc: | + Dimensionality to use in UMAP projection (from 1 to 50). If single value N + is provided, use from 1 to N PCs. If multiple values are provided, subset to + only selected PCs. In combination with --ntgr set to harmony, selected principal + components will be used in Harmony integration. + Default: from 1 to 10 + + umap_spread: + type: float? + inputBinding: + prefix: "--uspread" + doc: | + The effective scale of embedded points on UMAP.
In combination with '--umindist' + it determines how clustered/clumped the embedded points are. + Default: 1 + + umap_mindist: + type: float? + inputBinding: + prefix: "--umindist" + doc: | + Controls how tightly the embedding is allowed to compress points together on UMAP. + Larger values ensure embedded points are more evenly distributed, while smaller + values allow the algorithm to optimise more accurately with regard to local structure. + Sensible values are in the range 0.001 to 0.5. + Default: 0.3 + + umap_neighbors: + type: int? + inputBinding: + prefix: "--uneighbors" + doc: | + Determines the number of neighboring points used in UMAP. Larger values will result + in more global structure being preserved at the loss of detailed local structure. + In general this parameter should often be in the range 5 to 50. + Default: 30 + + umap_metric: + type: + - "null" + - type: enum + symbols: + - "euclidean" + - "manhattan" + - "chebyshev" + - "minkowski" + - "canberra" + - "braycurtis" + - "mahalanobis" + - "wminkowski" + - "seuclidean" + - "cosine" + - "correlation" + - "haversine" + - "hamming" + - "jaccard" + - "dice" + - "russelrao" + - "kulsinski" + - "ll_dirichlet" + - "hellinger" + - "rogerstanimoto" + - "sokalmichener" + - "sokalsneath" + - "yule" + inputBinding: + prefix: "--umetric" + doc: | + The metric to use to compute distances in high dimensional space for UMAP. + Default: cosine + + umap_method: + type: + - "null" + - type: enum + symbols: + - "uwot" + - "uwot-learn" + - "umap-learn" + inputBinding: + prefix: "--umethod" + doc: | + UMAP implementation to run. If set to 'umap-learn' use --umetric 'correlation' + Default: uwot + + export_pdf_plots: + type: boolean? + inputBinding: + prefix: "--pdf" + doc: | + Export plots in PDF.
+ Default: false + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + inputBinding: + prefix: "--theme" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + + verbose: + type: boolean? + inputBinding: + prefix: "--verbose" + doc: | + Print debug information. + Default: false + + export_h5seurat_data: + type: boolean? + inputBinding: + prefix: "--h5seurat" + doc: | + Save Seurat data to h5seurat file. + Default: false + + export_h5ad_data: + type: boolean? + inputBinding: + prefix: "--h5ad" + doc: | + Save Seurat data to h5ad file. + Default: false + + export_ucsc_cb: + type: boolean? + inputBinding: + prefix: "--cbbuild" + doc: | + Export results to UCSC Cell Browser. Default: false + + low_memory: + type: boolean? + inputBinding: + prefix: "--lowmem" + doc: | + Attempts to minimize RAM usage when integrating multiple datasets + with SCTransform algorithm (slows down the computation). Ignored if + '--ntgr' is not set to 'seurat' or if '--norm' is not set to either + 'sct' or 'sctglm'. + Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix. + Default: ./sc + + parallel_memory_limit: + type: int? + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Default: 32 + + vector_memory_limit: + type: int? + default: 128 + doc: | + Maximum vector memory in GB allowed to be used by R. + Default: 128 + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. + Default: 1 + + +outputs: + + elbow_plot_png: + type: File? + outputBinding: + glob: "*_elbow.png" + doc: | + Elbow plot (from cells PCA). + PNG format + + elbow_plot_pdf: + type: File? 
+ outputBinding: + glob: "*_elbow.pdf" + doc: | + Elbow plot (from cells PCA). + PDF format + + qc_dim_corr_plot_png: + type: File? + outputBinding: + glob: "*_qc_dim_corr.png" + doc: | + Correlation plots between QC metrics and cells PCA components. + PNG format + + qc_dim_corr_plot_pdf: + type: File? + outputBinding: + glob: "*_qc_dim_corr.pdf" + doc: | + Correlation plots between QC metrics and cells PCA components. + PDF format + + umap_qc_mtrcs_plot_png: + type: File? + outputBinding: + glob: "*_umap_qc_mtrcs.png" + doc: | + QC metrics on cells UMAP. + PNG format + + umap_qc_mtrcs_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_qc_mtrcs.pdf" + doc: | + QC metrics on cells UMAP. + PDF format + + umap_plot_png: + type: File? + outputBinding: + glob: "*_umap.png" + doc: | + Cells UMAP. + PNG format + + umap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap.pdf" + doc: | + Cells UMAP. + PDF format + + umap_spl_ph_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_ph.png" + doc: | + Split by cell cycle phase cells UMAP. + PNG format + + umap_spl_ph_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_ph.pdf" + doc: | + Split by cell cycle phase cells UMAP. + PDF format + + ccpca_plot_png: + type: File? + outputBinding: + glob: "*_ccpca.png" + doc: | + Cells PCA using only cell cycle genes. + PNG format + + ccpca_plot_pdf: + type: File? + outputBinding: + glob: "*_ccpca.pdf" + doc: | + Cells PCA using only cell cycle genes. + PDF format + + umap_spl_mito_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_mito.png" + doc: | + Split by the percentage of transcripts mapped to mitochondrial genes cells UMAP. + PNG format + + umap_spl_mito_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_mito.pdf" + doc: | + Split by the percentage of transcripts mapped to mitochondrial genes cells UMAP. + PDF format + + umap_spl_umi_plot_png: + type: File? 
+ outputBinding: + glob: "*_umap_spl_umi.png" + doc: | + Split by the UMI per cell counts cells UMAP. + PNG format + + umap_spl_umi_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_umi.pdf" + doc: | + Split by the UMI per cell counts cells UMAP. + PDF format + + umap_spl_gene_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_gene.png" + doc: | + Split by the genes per cell counts cells UMAP. + PNG format + + umap_spl_gene_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_gene.pdf" + doc: | + Split by the genes per cell counts cells UMAP. + PDF format + + umap_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_idnt.png" + doc: | + Split by dataset cells UMAP. + PNG format + + umap_spl_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_idnt.pdf" + doc: | + Split by dataset cells UMAP. + PDF format + + ccpca_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_ccpca_spl_idnt.png" + doc: | + Split by dataset cells PCA using only cell cycle genes. + PNG format + + ccpca_spl_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_ccpca_spl_idnt.pdf" + doc: | + Split by dataset cells PCA using only cell cycle genes. + PDF format + + umap_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_cnd.png" + doc: | + Split by grouping condition cells UMAP. + PNG format + + umap_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_cnd.pdf" + doc: | + Split by grouping condition cells UMAP. + PDF format + + umap_gr_cnd_spl_ph_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_cnd_spl_ph.png" + doc: | + Grouped by condition split by cell cycle cells UMAP. + PNG format + + umap_gr_cnd_spl_ph_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_gr_cnd_spl_ph.pdf" + doc: | + Grouped by condition split by cell cycle cells UMAP. + PDF format + + ccpca_spl_cnd_plot_png: + type: File? 
+ outputBinding: + glob: "*_ccpca_spl_cnd.png" + doc: | + Split by grouping condition cells PCA using only cell cycle genes. + PNG format + + ccpca_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_ccpca_spl_cnd.pdf" + doc: | + Split by grouping condition cells PCA using only cell cycle genes. + PDF format + + umap_gr_cnd_spl_mito_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_cnd_spl_mito.png" + doc: | + Grouped by condition split by the percentage of transcripts mapped to mitochondrial genes cells UMAP. + PNG format + + umap_gr_cnd_spl_mito_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_gr_cnd_spl_mito.pdf" + doc: | + Grouped by condition split by the percentage of transcripts mapped to mitochondrial genes cells UMAP. + PDF format + + umap_gr_cnd_spl_umi_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_cnd_spl_umi.png" + doc: | + Grouped by condition split by the UMI per cell counts cells UMAP. + PNG format + + umap_gr_cnd_spl_umi_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_gr_cnd_spl_umi.pdf" + doc: | + Grouped by condition split by the UMI per cell counts cells UMAP. + PDF format + + umap_gr_cnd_spl_gene_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_cnd_spl_gene.png" + doc: | + Grouped by condition split by the genes per cell counts cells UMAP. + PNG format + + umap_gr_cnd_spl_gene_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_gr_cnd_spl_gene.pdf" + doc: | + Grouped by condition split by the genes per cell counts cells UMAP. + PDF format + + ucsc_cb_config_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser" + doc: | + Directory with UCSC Cellbrowser configuration data. + + ucsc_cb_html_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser/html_data" + doc: | + Directory with UCSC Cellbrowser html data. + + ucsc_cb_html_file: + type: File? 
+ outputBinding: + glob: "*_cellbrowser/html_data/index.html" + doc: | + HTML index file from the directory with UCSC Cellbrowser html data. + + seurat_data_rds: + type: File + outputBinding: + glob: "*_data.rds" + doc: | + Reduced Seurat data in RDS format + + seurat_data_h5seurat: + type: File? + outputBinding: + glob: "*_data.h5seurat" + doc: | + Reduced Seurat data in h5seurat format + + seurat_data_h5ad: + type: File? + outputBinding: + glob: "*_data.h5ad" + doc: | + Reduced Seurat data in h5ad format + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["sc_rna_reduce.R"] + +stdout: sc_rna_reduce_stdout.log +stderr: sc_rna_reduce_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-cell RNA-Seq Dimensionality Reduction Analysis" +s:name: "Single-cell RNA-Seq Dimensionality Reduction Analysis" +s:alternateName: "Integrates multiple single-cell RNA-Seq datasets, reduces dimensionality using PCA" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-rna-reduce.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + 
s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell RNA-Seq Dimensionality Reduction Analysis + + Integrates multiple single-cell RNA-Seq datasets, reduces dimensionality using PCA. + + +s:about: | + usage: sc_rna_reduce.R + [-h] --query QUERY [--metadata METADATA] [--barcodes BARCODES] + [--cellcycle CELLCYCLE] [--norm {sct,log,sctglm}] + [--ntgr {seurat,harmony,none}] [--ntgrby [NTGRBY ...]] + [--highvargenes HIGHVARGENES] [--regressmt] + [--regressgenes [REGRESSGENES ...]] [--regressccfull | --regressccdiff] + [--dimensions [DIMENSIONS ...]] [--uspread USPREAD] + [--umindist UMINDIST] [--uneighbors UNEIGHBORS] + [--umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis,mahalanobis,wminkowski,seuclidean,cosine,correlation,haversine,hamming,jaccard,dice,russelrao,kulsinski,ll_dirichlet,hellinger,rogerstanimoto,sokalmichener,sokalsneath,yule}] + [--umethod {uwot,uwot-learn,umap-learn}] [--pdf] [--verbose] + [--h5seurat] [--h5ad] [--cbbuild] [--lowmem] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + + Single-cell RNA-Seq Dimensionality Reduction Analysis + + options: + -h, --help show this help message and exit + --query QUERY Path to the RDS file to load Seurat object from. This + file should include genes expression information + stored in the RNA assay. + --metadata METADATA Path to the TSV/CSV file to optionally extend Seurat + object metadata with categorical values using samples + identities. First column - 'library_id' should + correspond to all unique values from the 'new.ident' + column of the loaded Seurat object. If any of the + provided in this file columns are already present in + the Seurat object metadata, they will be overwritten. 
+ When combined with --barcodes parameter, first the + metadata will be extended, then barcode filtering will + be applied. Default: no extra metadata is added + --barcodes BARCODES Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata overwriting the existing ones if + those are present. Default: all cells used, no extra + metadata is added + --cellcycle CELLCYCLE + Path to the TSV/CSV file with the information for cell + cycle score assignment. First column - 'phase', second + column 'gene_id'. If loaded Seurat object already + includes cell cycle scores in 'S.Score', 'G2M.Score', + and 'CC.Difference' metadata columns they will be + overwritten. Default: skip cell cycle score + assignment. + --norm {sct,log,sctglm} + Normalization method applied to genes expression + counts. If loaded Seurat object includes multiple + datasets, normalization will be run independently for + each of them, unless integration is disabled with + 'none' or set to 'harmony' Default: sct + --ntgr {seurat,harmony,none} + Integration method used for joint analysis of multiple + datasets. Automatically set to 'none' if loaded Seurat + object includes only one dataset. Default: seurat + --ntgrby [NTGRBY ...] + Column(s) from the Seurat object metadata to define + the variable(s) that should be integrated out when + running multiple datasets integration with harmony. + May include columns from the extra metadata added with + --metadata parameter. Ignored if --ntgr is not set to + harmony. Default: new.ident + --highvargenes HIGHVARGENES + Number of highly variable genes used in datasets + integration, scaling and dimensionality reduction. + Default: 3000 + --regressmt Regress the percentage of transcripts mapped to + mitochondrial genes as a confounding source of + variation.
Default: false + --regressgenes [REGRESSGENES ...] + Genes whose expression should be regressed as a + confounding source of variation. Default: None + --regressccfull Regress all signals associated with cell cycle phase. + Ignored if --cellcycle is not provided. Mutually + exclusive with --regressccdiff parameter. Default: + false + --regressccdiff Regress only differences in cell cycle phase among + proliferating cells. Signals separating non-cycling + and cycling cells will be maintained. Ignored if + --cellcycle is not provided. Mutually exclusive with + --regressccfull Default: false + --dimensions [DIMENSIONS ...] + Dimensionality to use in UMAP projection (from 1 to + 50). If single value N is provided, use from 1 to N + PCs. If multiple values are provided, subset to only + selected PCs. In combination with --ntgr set to + harmony, selected principal components will be used in + Harmony integration. Default: from 1 to 10 + --uspread USPREAD The effective scale of embedded points on UMAP. In + combination with '--umindist' it determines how + clustered/clumped the embedded points are. Default: 1 + --umindist UMINDIST Controls how tightly the embedding is allowed to compress + points together on UMAP. Larger values ensure embedded + points are more evenly distributed, while smaller + values allow the algorithm to optimise more accurately + with regard to local structure. Sensible values are in + the range 0.001 to 0.5. Default: 0.3 + --uneighbors UNEIGHBORS + Determines the number of neighboring points used in + UMAP. Larger values will result in more global + structure being preserved at the loss of detailed + local structure. In general this parameter should + often be in the range 5 to 50.
Default: 30 + --umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis,mahalanobis,wminkowski,seuclidean,cosine,correlation,haversine,hamming,jaccard,dice,russelrao,kulsinski,ll_dirichlet,hellinger,rogerstanimoto,sokalmichener,sokalsneath,yule} + The metric to use to compute distances in high + dimensional space for UMAP. Default: cosine + --umethod {uwot,uwot-learn,umap-learn} + UMAP implementation to run. If set to 'umap-learn' use + --umetric 'correlation' Default: uwot + --pdf Export plots in PDF. Default: false + --verbose Print debug information. Default: false + --h5seurat Save Seurat data to h5seurat file. Default: false + --h5ad Save Seurat data to h5ad file. Default: false + --cbbuild Export results to UCSC Cell Browser. Default: false + --lowmem Attempts to minimize RAM usage when integrating + multiple datasets with SCTransform algorithm (slows + down the computation). Ignored if '--ntgr' is not set + to 'seurat' or if '--norm' is not set to either 'sct' + or 'sctglm'. Default: false + --output OUTPUT Output prefix. Default: ./sc + --theme {gray,bw,linedraw,light,dark,minimal,classic,void} + Color theme for all generated plots. Default: classic + --cpus CPUS Number of cores/cpus to use. Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple --cpus. 
Default: 32 \ No newline at end of file diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl new file mode 100644 index 00000000..bb0172d6 --- /dev/null +++ b/tools/sc-triangulate.cwl @@ -0,0 +1,442 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: EnvVarRequirement + envDef: + R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-tools:v0.0.15 + + +inputs: + + query_data_rds: + type: File + inputBinding: + prefix: "--query" + doc: | + Path to the RDS file to load Seurat object from. This file should include + genes expression and/or chromatin accessibility information stored in the RNA + and/or ATAC assays correspondingly. Additionally, 'rnaumap', and/or 'atacumap', + and/or 'wnnumap' dimensionality reductions should be present. + + barcodes_data: + type: File? + inputBinding: + prefix: "--barcodes" + doc: | + Path to the TSV/CSV file to optionally prefilter and extend Seurat object + metadata be selected barcodes. First column should be named as 'barcode'. + If file includes any other columns they will be added to the Seurat object + metadata ovewriting the existing ones if those are present. + Default: all cells used, no extra metadata is added + + query_source_column: + type: string[] + inputBinding: + prefix: "--source" + doc: | + Columns from the metadata of the loaded Seurat object to select + conflicting cells annotations. + + query_target_column: + type: string? + inputBinding: + prefix: "--target" + doc: | + Suffix to be used as part of the columns names to save label + integration result. + Default: sctri + + export_pdf_plots: + type: boolean? + inputBinding: + prefix: "--pdf" + doc: | + Export plots in PDF. 
+ Default: false + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + inputBinding: + prefix: "--theme" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + + verbose: + type: boolean? + inputBinding: + prefix: "--verbose" + doc: | + Print debug information. + Default: false + + export_h5seurat_data: + type: boolean? + inputBinding: + prefix: "--h5seurat" + doc: | + Save Seurat data to h5seurat file. + Default: false + + export_h5ad_data: + type: boolean? + inputBinding: + prefix: "--h5ad" + doc: | + Save Seurat data to h5ad file. + Default: false + + export_ucsc_cb: + type: boolean? + inputBinding: + prefix: "--cbbuild" + doc: | + Export results to UCSC Cell Browser. Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix. + Default: ./sc + + parallel_memory_limit: + type: int? + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Default: 32 + + vector_memory_limit: + type: int? + default: 128 + doc: | + Maximum vector memory in GB allowed to be used by R. + Default: 128 + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. + Default: 1 + + +outputs: + + umap_tril_rd_rnaumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_tril_rd_rnaumap.png" + doc: | + Cells UMAP with integrated labels (rnaumap dim. reduction). + PNG format + + umap_tril_rd_rnaumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_tril_rd_rnaumap.pdf" + doc: | + Cells UMAP with integrated labels (rnaumap dim. reduction). + PDF format + + umap_tril_rd_atacumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_tril_rd_atacumap.png" + doc: | + Cells UMAP with integrated labels (atacumap dim. 
reduction). + PNG format + + umap_tril_rd_atacumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_tril_rd_atacumap.pdf" + doc: | + Cells UMAP with integrated labels (atacumap dim. reduction). + PDF format + + umap_tril_rd_wnnumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_tril_rd_wnnumap.png" + doc: | + Cells UMAP with integrated labels (wnnumap dim. reduction). + PNG format + + umap_tril_rd_wnnumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_tril_rd_wnnumap.pdf" + doc: | + Cells UMAP with integrated labels (wnnumap dim. reduction). + PDF format + + umap_tria_rd_rnaumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_tria_rd_rnaumap.png" + doc: | + Cells UMAP with winning annotations (rnaumap dim. reduction). + PNG format + + umap_tria_rd_rnaumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_tria_rd_rnaumap.pdf" + doc: | + Cells UMAP with winning annotations (rnaumap dim. reduction). + PDF format + + umap_tria_rd_atacumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_tria_rd_atacumap.png" + doc: | + Cells UMAP with winning annotations (atacumap dim. reduction). + PNG format + + umap_tria_rd_atacumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_tria_rd_atacumap.pdf" + doc: | + Cells UMAP with winning annotations (atacumap dim. reduction). + PDF format + + umap_tria_rd_wnnumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_tria_rd_wnnumap.png" + doc: | + Cells UMAP with winning annotations (wnnumap dim. reduction). + PNG format + + umap_tria_rd_wnnumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_tria_rd_wnnumap.pdf" + doc: | + Cells UMAP with winning annotations (wnnumap dim. reduction). + PDF format + + umap_tric_rd_rnaumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_tric_rd_rnaumap.png" + doc: | + Cells UMAP with integration confidence scores (rnaumap dim. reduction). + PNG format + + umap_tric_rd_rnaumap_plot_pdf: + type: File? 
+ outputBinding: + glob: "*_umap_tric_rd_rnaumap.pdf" + doc: | + Cells UMAP with integration confidence scores (rnaumap dim. reduction). + PDF format + + umap_tric_rd_atacumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_tric_rd_atacumap.png" + doc: | + Cells UMAP with integration confidence scores (atacumap dim. reduction). + PNG format + + umap_tric_rd_atacumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_tric_rd_atacumap.pdf" + doc: | + Cells UMAP with integration confidence scores (atacumap dim. reduction). + PDF format + + umap_tric_rd_wnnumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_tric_rd_wnnumap.png" + doc: | + Cells UMAP with integration confidence scores (wnnumap dim. reduction). + PNG format + + umap_tric_rd_wnnumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_tric_rd_wnnumap.pdf" + doc: | + Cells UMAP with integration confidence scores (wnnumap dim. reduction). + PDF format + + ucsc_cb_config_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser" + doc: | + Directory with UCSC Cellbrowser configuration data. + + ucsc_cb_html_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser/html_data" + doc: | + Directory with UCSC Cellbrowser html data. + + ucsc_cb_html_file: + type: File? + outputBinding: + glob: "*_cellbrowser/html_data/index.html" + doc: | + HTML index file from the directory with UCSC Cellbrowser html data. + + seurat_data_rds: + type: File + outputBinding: + glob: "*_data.rds" + doc: | + Seurat data in RDS format + + seurat_data_h5seurat: + type: File? + outputBinding: + glob: "*_data.h5seurat" + doc: | + Seurat data in h5seurat format + + seurat_data_h5ad: + type: File? 
+ outputBinding: + glob: "*_data.h5ad" + doc: | + Seurat data in h5ad format + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["sc_triangulate.R"] + +stdout: sc_triangulate_stdout.log +stderr: sc_triangulate_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-cell Label Integration Analysis" +s:name: "Single-cell Label Integration Analysis" +s:alternateName: "Harmonizes conflicting annotations in single-cell genomics studies" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-triangulate.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell Label Integration Analysis + + Harmonizes conflicting annotations in single-cell genomics studies. + + +s:about: | + usage: sc_triangulate.R + [-h] --query QUERY [--barcodes BARCODES] --source SOURCE [SOURCE ...] 
+ [--target TARGET] [--pdf] [--verbose] [--h5seurat] [--h5ad] [--cbbuild] + [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + + Single-cell Label Integration Analysis + + options: + -h, --help show this help message and exit + --query QUERY Path to the RDS file to load Seurat object from. This + file should include genes expression and/or chromatin + accessibility information stored in the RNA and/or + ATAC assays correspondingly. Additionally, 'rnaumap', + and/or 'atacumap', and/or 'wnnumap' dimensionality + reductions should be present. + --barcodes BARCODES Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata overwriting the existing ones if + those are present. Default: all cells used, no extra + metadata is added + --source SOURCE [SOURCE ...] + Columns from the metadata of the loaded Seurat object + to select conflicting cells annotations. + --target TARGET Suffix to be used as part of the columns names to save + label integration result. Default: sctri + --pdf Export plots in PDF. Default: false + --verbose Print debug information. Default: false + --h5seurat Save Seurat data to h5seurat file. Default: false + --h5ad Save Seurat data to h5ad file. Default: false + --cbbuild Export results to UCSC Cell Browser. Default: false + --output OUTPUT Output prefix. Default: ./sc + --theme {gray,bw,linedraw,light,dark,minimal,classic,void} + Color theme for all generated plots. Default: classic + --cpus CPUS Number of cores/cpus to use. Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple --cpus.
Default: 32 \ No newline at end of file diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl new file mode 100644 index 00000000..d564ee83 --- /dev/null +++ b/tools/sc-wnn-cluster.cwl @@ -0,0 +1,993 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: EnvVarRequirement + envDef: + R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-tools:v0.0.15 + + +inputs: + + query_data_rds: + type: File + inputBinding: + prefix: "--query" + doc: | + Path to the RDS file to load Seurat object from. This file should include + genes expression and chromatin accessibility information stored in the RNA + and ATAC assays correspondingly. Additionally, 'pca', 'rnaumap', 'atac_lsi' + and 'atacumap' dimensionality reductions should be present. + + rna_dimensions: + type: + - "null" + - int + - int[] + inputBinding: + prefix: "--rnadimensions" + doc: | + Dimensionality from the 'pca' reduction to use when constructing weighted + nearest-neighbor graph before clustering (from 1 to 50). If single value N + is provided, use from 1 to N dimensions. If multiple values are provided, + subset to only selected dimensions. + Default: from 1 to 10 + + atac_dimensions: + type: + - "null" + - int + - int[] + inputBinding: + prefix: "--atacdimensions" + doc: | + Dimensionality from the 'atac_lsi' reduction to use when constructing weighted + nearest-neighbor graph before clustering (from 1 to 50). If single value N + is provided, use from 2 to N dimensions. If multiple values are provided, + subset to only selected dimensions. + Default: from 2 to 10 + + cluster_algorithm: + type: + - "null" + - type: enum + symbols: + - "louvain" + - "mult-louvain" + - "slm" + - "leiden" + inputBinding: + prefix: "--algorithm" + doc: | + Algorithm for modularity optimization when running clustering. + Default: slm + + umap_spread: + type: float? 
+ inputBinding: + prefix: "--uspread" + doc: | + The effective scale of embedded points on UMAP. In combination with '--umindist' + it determines how clustered/clumped the embedded points are. + Default: 1 + + umap_mindist: + type: float? + inputBinding: + prefix: "--umindist" + doc: | + Controls how tightly the embedding is allowed to compress points together on UMAP. + Larger values ensure embedded points are more evenly distributed, while smaller + values allow the algorithm to optimise more accurately with regard to local structure. + Sensible values are in the range 0.001 to 0.5. + Default: 0.3 + + umap_neighbors: + type: int? + inputBinding: + prefix: "--uneighbors" + doc: | + Determines the number of neighboring points used in UMAP. Larger values will result + in more global structure being preserved at the loss of detailed local structure. + In general this parameter should often be in the range 5 to 50. + Default: 30 + + umap_metric: + type: + - "null" + - type: enum + symbols: + - "euclidean" + - "manhattan" + - "chebyshev" + - "minkowski" + - "canberra" + - "braycurtis" + - "mahalanobis" + - "wminkowski" + - "seuclidean" + - "cosine" + - "correlation" + - "haversine" + - "hamming" + - "jaccard" + - "dice" + - "russelrao" + - "kulsinski" + - "ll_dirichlet" + - "hellinger" + - "rogerstanimoto" + - "sokalmichener" + - "sokalsneath" + - "yule" + inputBinding: + prefix: "--umetric" + doc: | + The metric to use to compute distances in high dimensional space for UMAP. + Default: cosine + + umap_method: + type: + - "null" + - type: enum + symbols: + - "uwot" + - "uwot-learn" + - "umap-learn" + inputBinding: + prefix: "--umethod" + doc: | + UMAP implementation to run. If set to 'umap-learn' use --umetric 'correlation' + Default: uwot + + resolution: + type: + - "null" + - float + - float[] + inputBinding: + prefix: "--resolution" + doc: | + Clustering resolution applied to the constructed weighted nearest-neighbor + graph.
Can be set as an array but only the first item from the list will + be used for cluster labels and gene/peak markers in the UCSC Cell Browser + when running with --cbbuild and --diffgenes/--diffpeaks parameters. + Default: 0.3, 0.5, 1.0 + + atac_fragments_file: + type: File? + secondaryFiles: + - .tbi + inputBinding: + prefix: "--fragments" + doc: | + Count and barcode information for every ATAC fragment used in the loaded Seurat + object. File should be saved in TSV format with tbi-index file. + + genes_of_interest: + type: + - "null" + - string + - string[] + inputBinding: + prefix: "--genes" + doc: | + Genes of interest to build gene expression and Tn5 insertion frequency plots + for the nearest peaks. If '--fragments' is not provided only gene expression + plots will be built. + Default: None + + identify_diff_genes: + type: boolean? + inputBinding: + prefix: "--diffgenes" + doc: | + Identify differentially expressed genes (putative gene markers) between each + pair of clusters for all resolutions. + Default: false + + identify_diff_peaks: + type: boolean? + inputBinding: + prefix: "--diffpeaks" + doc: | + Identify differentially accessible peaks between each pair of clusters for all resolutions. + Default: false + + rna_minimum_logfc: + type: float? + inputBinding: + prefix: "--rnalogfc" + doc: | + For putative gene markers identification include only those genes that + on average have log fold change difference in expression between every + tested pair of clusters not lower than this value. Ignored if '--diffgenes' + is not set. + Default: 0.25 + + rna_minimum_pct: + type: float? + inputBinding: + prefix: "--rnaminpct" + doc: | + For putative gene markers identification include only those genes that + are detected in not lower than this fraction of cells in either of the + two tested clusters. Ignored if '--diffgenes' is not set. + Default: 0.1 + + only_positive_diff_genes: + type: boolean? 
+ inputBinding: + prefix: "--rnaonlypos" + doc: | + For putative gene markers identification return only positive markers. + Ignored if '--diffgenes' is not set. + Default: false + + rna_test_to_use: + type: + - "null" + - type: enum + symbols: + - "wilcox" + - "bimod" + - "roc" + - "t" + - "negbinom" + - "poisson" + - "LR" + - "MAST" + - "DESeq2" + inputBinding: + prefix: "--rnatestuse" + doc: | + Statistical test to use for putative gene markers identification. + Ignored if '--diffgenes' is not set. + Default: wilcox + + atac_minimum_logfc: + type: float? + inputBinding: + prefix: "--ataclogfc" + doc: | + For differentially accessible peaks identification include only those peaks that + on average have log fold change difference in the chromatin accessibility between + every tested pair of clusters not lower than this value. Ignored if '--diffpeaks' + is not set. + Default: 0.25 + + atac_minimum_pct: + type: float? + inputBinding: + prefix: "--atacminpct" + doc: | + For differentially accessible peaks identification include only those peaks that + are detected in not lower than this fraction of cells in either of the two tested + clusters. Ignored if '--diffpeaks' is not set. + Default: 0.05 + + atac_test_to_use: + type: + - "null" + - type: enum + symbols: + - "wilcox" + - "bimod" + - "roc" + - "t" + - "negbinom" + - "poisson" + - "LR" + - "MAST" + - "DESeq2" + inputBinding: + prefix: "--atactestuse" + doc: | + Statistical test to use for differentially accessible peaks identification. + Ignored if '--diffpeaks' is not set. + Default: LR + + export_pdf_plots: + type: boolean? + inputBinding: + prefix: "--pdf" + doc: | + Export plots in PDF. + Default: false + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + inputBinding: + prefix: "--theme" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. 
+ Default: classic + + verbose: + type: boolean? + inputBinding: + prefix: "--verbose" + doc: | + Print debug information. + Default: false + + export_h5seurat_data: + type: boolean? + inputBinding: + prefix: "--h5seurat" + doc: | + Save Seurat data to h5seurat file. + Default: false + + export_h5ad_data: + type: boolean? + inputBinding: + prefix: "--h5ad" + doc: | + Save Seurat data to h5ad file. + Default: false + + export_ucsc_cb: + type: boolean? + inputBinding: + prefix: "--cbbuild" + doc: | + Export results to UCSC Cell Browser. Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix. + Default: ./sc + + parallel_memory_limit: + type: int? + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Default: 32 + + vector_memory_limit: + type: int? + default: 128 + doc: | + Maximum vector memory in GB allowed to be used by R. + Default: 128 + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. + Default: 1 + + +outputs: + + umap_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_res_*.png" + doc: | + Clustered cells UMAP. + PNG format + + umap_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_res_*.pdf" + doc: | + Clustered cells UMAP. + PDF format + + umap_spl_idnt_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_idnt_res_*.png" + doc: | + Split by dataset clustered cells UMAP. + PNG format + + umap_spl_idnt_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_idnt_res_*.pdf" + doc: | + Split by dataset clustered cells UMAP. 
+ PDF format + + cmp_gr_clst_spl_idnt_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_clst_spl_idnt_res_*.png" + doc: | + Grouped by cluster split by dataset cells composition plot. Downsampled. + PNG format + + cmp_gr_clst_spl_idnt_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_clst_spl_idnt_res_*.pdf" + doc: | + Grouped by cluster split by dataset cells composition plot. Downsampled. + PDF format + + cmp_gr_idnt_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_idnt_spl_clst_res_*.png" + doc: | + Grouped by dataset split by cluster cells composition plot. Downsampled. + PNG format + + cmp_gr_idnt_spl_clst_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_idnt_spl_clst_res_*.pdf" + doc: | + Grouped by dataset split by cluster cells composition plot. Downsampled. + PDF format + + umap_spl_cnd_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_cnd_res_*.png" + doc: | + Split by grouping condition clustered cells UMAP. + PNG format + + umap_spl_cnd_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_cnd_res_*.pdf" + doc: | + Split by grouping condition clustered cells UMAP. + PDF format + + cmp_gr_clst_spl_cnd_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_clst_spl_cnd_res_*.png" + doc: | + Grouped by cluster split by condition cells composition plot. Downsampled. + PNG format + + cmp_gr_clst_spl_cnd_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_clst_spl_cnd_res_*.pdf" + doc: | + Grouped by cluster split by condition cells composition plot. Downsampled. 
+ PDF format + + cmp_gr_cnd_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_cnd_spl_clst_res_*.png" + doc: | + Grouped by condition split by cluster cells composition plot. Downsampled. + PNG format + + cmp_gr_cnd_spl_clst_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_cnd_spl_clst_res_*.pdf" + doc: | + Grouped by condition split by cluster cells composition plot. Downsampled. + PDF format + + umap_spl_ph_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_ph_res_*.png" + doc: | + Split by cell cycle phase clustered cells UMAP. + PNG format + + umap_spl_ph_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umap_spl_ph_res_*.pdf" + doc: | + Split by cell cycle phase clustered cells UMAP. + PDF format + + cmp_gr_ph_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_idnt.png" + doc: | + Grouped by cell cycle phase split by dataset cells composition plot. Downsampled. + PNG format + + cmp_gr_ph_spl_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_idnt.pdf" + doc: | + Grouped by cell cycle phase split by dataset cells composition plot. Downsampled. + PDF format + + cmp_gr_ph_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_ph_spl_clst_res_*.png" + doc: | + Grouped by cell cycle phase split by cluster cells composition plot. Downsampled. + PNG format + + cmp_gr_ph_spl_clst_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cmp_gr_ph_spl_clst_res_*.pdf" + doc: | + Grouped by cell cycle phase split by cluster cells composition plot. Downsampled. 
+ PDF format + + xpr_avg_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_avg_res_*.png" + doc: | + Log normalized scaled average gene expression per cluster. + PNG format + + xpr_avg_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_avg_res_*.pdf" + doc: | + Log normalized scaled average gene expression per cluster. + PDF format + + xpr_per_cell_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_[!sgnl_]*.png" + doc: | + Log normalized gene expression on cells UMAP. + PNG format + + xpr_per_cell_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_[!sgnl_]*.pdf" + doc: | + Log normalized gene expression on cells UMAP. + PDF format + + xpr_per_cell_sgnl_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_sgnl_*.png" + doc: | + Log normalized gene expression density on cells UMAP. + PNG format + + xpr_per_cell_sgnl_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_sgnl_*.pdf" + doc: | + Log normalized gene expression density on cells UMAP. + PDF format + + xpr_dnst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_dnst_res_*.png" + doc: | + Log normalized gene expression density per cluster. + PNG format + + xpr_dnst_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_dnst_res_*.pdf" + doc: | + Log normalized gene expression density per cluster. + PDF format + + cvrg_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cvrg_res_*.png" + doc: | + Tn5 insertion frequency plot around gene. 
+ PNG format + + cvrg_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cvrg_res_*.pdf" + doc: | + Tn5 insertion frequency plot around gene. + PDF format + + xpr_htmp_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_htmp_res_*.png" + doc: | + Normalized gene expression heatmap grouped by cluster. + PNG format + + xpr_htmp_res_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_htmp_res_*.pdf" + doc: | + Normalized gene expression heatmap grouped by cluster. + PDF format + + gene_markers_tsv: + type: File? + outputBinding: + glob: "*_gene_markers.tsv" + doc: | + Differentially expressed genes between each pair of clusters for all resolutions. + TSV format + + peak_markers_tsv: + type: File? + outputBinding: + glob: "*_peak_markers.tsv" + doc: | + Differentially accessible peaks between each pair of clusters for all resolutions. + TSV format + + ucsc_cb_config_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser" + doc: | + Directory with UCSC Cellbrowser configuration data. + + ucsc_cb_html_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser/html_data" + doc: | + Directory with UCSC Cellbrowser html data. + + ucsc_cb_html_file: + type: File? + outputBinding: + glob: "*_cellbrowser/html_data/index.html" + doc: | + HTML index file from the directory with UCSC Cellbrowser html data. + + seurat_data_rds: + type: File + outputBinding: + glob: "*_data.rds" + doc: | + Reduced Seurat data in RDS format + + seurat_data_h5seurat: + type: File? + outputBinding: + glob: "*_data.h5seurat" + doc: | + Reduced Seurat data in h5seurat format + + seurat_data_h5ad: + type: File? 
+ outputBinding: + glob: "*_data.h5ad" + doc: | + Reduced Seurat data in h5ad format + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["sc_wnn_cluster.R"] + +stdout: sc_wnn_cluster_stdout.log +stderr: sc_wnn_cluster_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-cell WNN Cluster Analysis" +s:name: "Single-cell WNN Cluster Analysis" +s:alternateName: "Clusters multiome ATAC and RNA-Seq datasets, identifies gene markers and differentially accessible peaks" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-wnn-cluster.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell WNN Cluster Analysis + + Clusters multiome ATAC and RNA-Seq datasets, identifies gene markers + and differentially accessible peaks. 
+ + +s:about: | + usage: sc_wnn_cluster.R + [-h] --query QUERY [--rnadimensions [RNADIMENSIONS ...]] + [--atacdimensions [ATACDIMENSIONS ...]] + [--algorithm {louvain,mult-louvain,slm,leiden}] [--uspread USPREAD] + [--umindist UMINDIST] [--uneighbors UNEIGHBORS] + [--umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis,mahalanobis,wminkowski,seuclidean,cosine,correlation,haversine,hamming,jaccard,dice,russelrao,kulsinski,ll_dirichlet,hellinger,rogerstanimoto,sokalmichener,sokalsneath,yule}] + [--umethod {uwot,uwot-learn,umap-learn}] + [--resolution [RESOLUTION ...]] [--fragments FRAGMENTS] + [--genes [GENES ...]] [--diffgenes] [--diffpeaks] [--rnalogfc RNALOGFC] + [--rnaminpct RNAMINPCT] [--rnaonlypos] + [--rnatestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] + [--ataclogfc ATACLOGFC] [--atacminpct ATACMINPCT] + [--atactestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] + [--pdf] [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + + Single-cell WNN Cluster Analysis + + options: + -h, --help show this help message and exit + --query QUERY Path to the RDS file to load Seurat object from. This + file should include genes expression and chromatin + accessibility information stored in the RNA and ATAC + assays correspondingly. Additionally, 'pca', + 'rnaumap', 'atac_lsi' and 'atacumap' dimensionality + reductions should be present. + --rnadimensions [RNADIMENSIONS ...] + Dimensionality from the 'pca' reduction to use when + constructing weighted nearest-neighbor graph before + clustering (from 1 to 50). If single value N is + provided, use from 1 to N dimensions. If multiple + values are provided, subset to only selected + dimensions. Default: from 1 to 10 + --atacdimensions [ATACDIMENSIONS ...] 
+ Dimensionality from the 'atac_lsi' reduction to use + when constructing weighted nearest-neighbor graph + before clustering (from 1 to 50). If single value N is + provided, use from 2 to N dimensions. If multiple + values are provided, subset to only selected + dimensions. Default: from 2 to 10 + --algorithm {louvain,mult-louvain,slm,leiden} + Algorithm for modularity optimization when running + clustering. Default: louvain + --uspread USPREAD The effective scale of embedded points on UMAP. In + combination with '--umindist' it determines how + clustered/clumped the embedded points are. Default: 1 + --umindist UMINDIST Controls how tightly the embedding is allowed to compress + points together on UMAP. Larger values ensure embedded + points are more evenly distributed, while smaller + values allow the algorithm to optimise more accurately + with regard to local structure. Sensible values are in + the range 0.001 to 0.5. Default: 0.3 + --uneighbors UNEIGHBORS + Determines the number of neighboring points used in + UMAP. Larger values will result in more global + structure being preserved at the loss of detailed + local structure. In general this parameter should + often be in the range 5 to 50. Default: 30 + --umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis,mahalanobis,wminkowski,seuclidean,cosine,correlation,haversine,hamming,jaccard,dice,russelrao,kulsinski,ll_dirichlet,hellinger,rogerstanimoto,sokalmichener,sokalsneath,yule} + The metric to use to compute distances in high + dimensional space for UMAP. Default: cosine + --umethod {uwot,uwot-learn,umap-learn} + UMAP implementation to run. If set to 'umap-learn' use + --umetric 'correlation' Default: uwot + --resolution [RESOLUTION ...] + Clustering resolution applied to the constructed + weighted nearest-neighbor graph.
Can be set as an + array but only the first item from the list will be + used for cluster labels and gene/peak markers in the + UCSC Cell Browser when running with --cbbuild and + --diffgenes/--diffpeaks parameters. Default: 0.3, 0.5, + 1.0 + --fragments FRAGMENTS + Count and barcode information for every ATAC fragment + used in the loaded Seurat object. File should be saved + in TSV format with tbi-index file. + --genes [GENES ...] Genes of interest to build gene expression and Tn5 + insertion frequency plots for the nearest peaks. If ' + --fragments' is not provided only gene expression + plots will be built. Default: None + --diffgenes Identify differentially expressed genes (putative gene + markers) between each pair of clusters for all + resolutions. Default: false + --diffpeaks Identify differentially accessible peaks between each + pair of clusters for all resolutions. Default: false + --rnalogfc RNALOGFC For putative gene markers identification include only + those genes that on average have log fold change + difference in expression between every tested pair of + clusters not lower than this value. Ignored if '-- + diffgenes' is not set. Default: 0.25 + --rnaminpct RNAMINPCT + For putative gene markers identification include only + those genes that are detected in not lower than this + fraction of cells in either of the two tested + clusters. Ignored if '--diffgenes' is not set. + Default: 0.1 + --rnaonlypos For putative gene markers identification return only + positive markers. Ignored if '--diffgenes' is not set. + Default: false + --rnatestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2} + Statistical test to use for putative gene markers + identification. Ignored if '--diffgenes' is not set. 
+ Default: wilcox + --ataclogfc ATACLOGFC + For differentially accessible peaks identification + include only those peaks that on average have log fold + change difference in the chromatin accessibility + between every tested pair of clusters not lower than + this value. Ignored if '--diffpeaks' is not set. + Default: 0.25 + --atacminpct ATACMINPCT + For differentially accessible peaks identification + include only those peaks that are detected in not + lower than this fraction of cells in either of the two + tested clusters. Ignored if '--diffpeaks' is not set. + Default: 0.05 + --atactestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2} + Statistical test to use for differentially accessible + peaks identification. Ignored if '--diffpeaks' is not + set. Default: LR + --pdf Export plots in PDF. Default: false + --verbose Print debug information. Default: false + --h5seurat Save Seurat data to h5seurat file. Default: false + --h5ad Save Seurat data to h5ad file. Default: false + --cbbuild Export results to UCSC Cell Browser. Default: false + --output OUTPUT Output prefix. Default: ./sc + --theme {gray,bw,linedraw,light,dark,minimal,classic,void} + Color theme for all generated plots. Default: classic + --cpus CPUS Number of cores/cpus to use. Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple --cpus. 
Default: 32 \ No newline at end of file diff --git a/tools/tar-compress.cwl b/tools/tar-compress.cwl index 9325c94e..6e492341 100644 --- a/tools/tar-compress.cwl +++ b/tools/tar-compress.cwl @@ -8,7 +8,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/scidap:v0.0.3 + dockerPull: ubuntu:20.04 inputs: @@ -43,7 +43,10 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -s:name: "tar-compress" +label: "TAR compress" +s:name: "TAR compress" +s:alternateName: "Creates compressed TAR file from a folder" + s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/tar-compress.cwl s:codeRepository: https://github.com/Barski-lab/workflows s:license: http://www.apache.org/licenses/LICENSE-2.0 @@ -80,5 +83,8 @@ s:creator: doc: | - Compresses input directory to tar.gz + TAR compress + ========================================= + + Creates compressed TAR file from a folder diff --git a/tools/tar-extract.cwl b/tools/tar-extract.cwl new file mode 100644 index 00000000..83040fd6 --- /dev/null +++ b/tools/tar-extract.cwl @@ -0,0 +1,84 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: + - class: InlineJavascriptRequirement + + +hints: +- class: DockerRequirement + dockerPull: ubuntu:20.04 + + +inputs: + + file_to_extract: + type: File + inputBinding: + position: 1 + doc: "File to extract" + + +outputs: + + extracted_folder: + type: Directory + outputBinding: + glob: "*" + doc: "Extracted folder" + + +baseCommand: ["tar", "xzf"] + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "TAR extract" +s:name: "TAR extract" +s:alternateName: "Extracts the content of TAR file into a folder" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/tar-extract.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: 
http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + TAR extract + =============================================== + + Extracts the content of TAR file into a folder. 
\ No newline at end of file diff --git a/workflows/cellranger-aggr.cwl b/workflows/cellranger-aggr.cwl index 56aaa469..a3832a81 100644 --- a/workflows/cellranger-aggr.cwl +++ b/workflows/cellranger-aggr.cwl @@ -10,8 +10,9 @@ requirements: 'sd:upstream': - sc_rnaseq_sample: + sc_experiment: - "single-cell-preprocess-cellranger.cwl" + - "cellranger-multi.cwl" inputs: @@ -23,30 +24,61 @@ inputs: position: 1 molecule_info_h5: - type: File[] - label: "scRNA-Seq Cell Ranger Experiment" + type: + - "null" + - File[] + label: "Single-cell Experiment" doc: "Molecule-level information from individual runs of cellranger count" - 'sd:upstreamSource': "sc_rnaseq_sample/molecule_info_h5" + 'sd:upstreamSource': "sc_experiment/molecule_info_h5" + 'sd:localLabel': true + + filtered_data_folder: + type: + - "null" + - Directory[] + label: "Single-cell Experiment" + doc: "Filtered data folders from individual runs of cellranger multi" + 'sd:upstreamSource': "sc_experiment/filtered_data_folder" 'sd:localLabel': true gem_well_labels: type: string[] - label: "scRNA-Seq Cell Ranger Experiment" + label: "Single-cell Experiment" doc: "Array of GEM well identifiers to be used for labeling purposes only" - 'sd:upstreamSource': "sc_rnaseq_sample/alias" + 'sd:upstreamSource': "sc_experiment/alias" 'sd:localLabel': true normalization_mode: type: - "null" - type: enum - symbols: ["none", "mapped"] + symbols: + - "none" + - "mapped" default: "mapped" label: "Library depth normalization mode" doc: "Library depth normalization mode" 'sd:layout': advanced: true + clonotype_grouping: + type: + - "null" + - type: enum + name: "clonotype_grouping" + symbols: + - "same_donor_different_origins" + - "same_donor_and_origin" + - "different_donors" + default: "different_donors" + label: "Clonotype grouping. 
Ignored if upstream analysis doesn't include V(D)J data" + doc: | + When cellranger aggr is called with cellranger multi outputs, there are three + ways it can process the datasets depending on the combination of donor and + origin values + 'sd:layout': + advanced: true + threads: type: int? default: 4 @@ -70,8 +102,7 @@ outputs: type: File outputSource: aggregate_counts/web_summary_report label: "Aggregated run summary metrics and charts in HTML format" - doc: | - Aggregated run summary metrics and charts in HTML format + doc: "Aggregated run summary metrics and charts in HTML format" 'sd:visualPlugins': - linkList: tab: 'Overview' @@ -80,99 +111,113 @@ outputs: metrics_summary_report_json: type: File outputSource: aggregate_counts/metrics_summary_report_json - label: "Aggregated run summary metrics in JSON format" - doc: | - Aggregated run summary metrics in JSON format + label: "Aggregated GEX run summary metrics in JSON format" + doc: "Aggregated GEX run summary metrics in JSON format" secondary_analysis_report_folder: type: File outputSource: compress_secondary_analysis_report_folder/compressed_folder label: "Compressed folder with aggregated secondary analysis results" doc: | - Compressed folder with secondary analysis results including dimensionality reduction, - cell clustering, and differential expression of aggregated results + Compressed folder with secondary analysis of GEX data including dimensionality + reduction, cell clustering, and differential expression filtered_feature_bc_matrix_folder: type: File outputSource: compress_filtered_feature_bc_matrix_folder/compressed_folder label: "Compressed folder with aggregated filtered feature-barcode matrices" doc: | - Compressed folder with aggregated filtered feature-barcode matrices containing only cellular barcodes in MEX format + Compressed folder with aggregated filtered feature-barcode matrices + containing only cellular barcodes in MEX format filtered_feature_bc_matrix_h5: type: File outputSource: 
aggregate_counts/filtered_feature_bc_matrix_h5 label: "Aggregated filtered feature-barcode matrices in HDF5 format" doc: | - Aggregated filtered feature-barcode matrices containing only cellular barcodes in HDF5 format - - raw_feature_bc_matrices_folder: - type: File - outputSource: compress_raw_feature_bc_matrices_folder/compressed_folder - label: "Compressed folder with aggregated unfiltered feature-barcode matrices" - doc: | - Compressed folder with aggregated unfiltered feature-barcode matrices containing all barcodes in MEX format - - raw_feature_bc_matrices_h5: - type: File - outputSource: aggregate_counts/raw_feature_bc_matrices_h5 - label: "Aggregated unfiltered feature-barcode matrices in HDF5 format" - doc: | - Aggregated unfiltered feature-barcode matrices containing all barcodes in HDF5 format - - loupe_browser_track: - type: File - outputSource: aggregate_counts/loupe_browser_track - label: "Loupe Browser visualization and analysis file for aggregated results" - doc: | - Loupe Browser visualization and analysis file for aggregated results + Filtered feature-barcode matrices containing only cellular + barcodes in HDF5 format aggregation_metadata: type: File outputSource: aggregate_counts/aggregation_metadata label: "Aggregation metadata in CSV format" - doc: | - Aggregation metadata in CSV format + doc: "Aggregation metadata in CSV format" - aggregate_counts_stdout_log: - type: File - outputSource: aggregate_counts/stdout_log - label: "stdout log generated by cellranger aggr" - doc: | - stdout log generated by cellranger aggr - - aggregate_counts_stderr_log: + loupe_browser_track: type: File - outputSource: aggregate_counts/stderr_log - label: "stderr log generated by cellranger aggr" - doc: | - stderr log generated by cellranger aggr + outputSource: aggregate_counts/loupe_browser_track + label: "Loupe Browser visualization and analysis file" + doc: "Loupe Browser visualization and analysis file" + + clonotypes_csv: + type: File? 
+ outputSource: aggregate_counts/clonotypes_csv + label: "CSV file with high-level descriptions of each clonotype" + doc: "CSV file with high-level descriptions of each clonotype" + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'V(D)J clonotypes' + Title: 'V(D)J clonotypes' + + consensus_sequences_fasta: + type: File? + outputSource: aggregate_counts/consensus_sequences_fasta + label: "The consensus sequence of each assembled contig" + doc: "The consensus sequence of each assembled contig" + + consensus_annotations_csv: + type: File? + outputSource: aggregate_counts/consensus_annotations_csv + label: "CSV file with high-level and detailed annotations of each clonotype consensus sequence" + doc: "CSV file with high-level and detailed annotations of each clonotype consensus sequence" + + filtered_contig_annotations_csv: + type: File? + outputSource: aggregate_counts/filtered_contig_annotations_csv + label: "CSV file with high-level annotations of each high-confidence contig from cell-associated barcodes" + doc: "CSV file with high-level annotations of each high-confidence contig from cell-associated barcodes" + + loupe_vdj_browser_track: + type: File? 
+ outputSource: aggregate_counts/loupe_vdj_browser_track + label: "Loupe V(D)J Browser visualization and analysis file" + doc: "Loupe V(D)J Browser visualization and analysis file" compressed_html_data_folder: type: File outputSource: compress_html_data_folder/compressed_folder label: "Compressed folder with CellBrowser formatted results" - doc: | - Compressed folder with CellBrowser formatted results + doc: "Compressed folder with CellBrowser formatted results" html_data_folder: type: Directory outputSource: cellbrowser_build/html_data label: "Folder with not compressed CellBrowser formatted results" - doc: | - Folder with not compressed CellBrowser formatted results + doc: "Folder with not compressed CellBrowser formatted results" cellbrowser_report: type: File outputSource: cellbrowser_build/index_html_file label: "CellBrowser formatted Cellranger report" - doc: | - CellBrowser formatted Cellranger report + doc: "CellBrowser formatted Cellranger report" 'sd:visualPlugins': - linkList: tab: 'Overview' target: "_blank" + aggregate_counts_stdout_log: + type: File + outputSource: aggregate_counts/stdout_log + label: "stdout log generated by cellranger aggr" + doc: "stdout log generated by cellranger aggr" + + aggregate_counts_stderr_log: + type: File + outputSource: aggregate_counts/stderr_log + label: "stderr log generated by cellranger aggr" + doc: "stderr log generated by cellranger aggr" + steps: @@ -180,8 +225,10 @@ steps: run: ../tools/cellranger-aggr.cwl in: molecule_info_h5: molecule_info_h5 + filtered_data_folder: filtered_data_folder gem_well_labels: gem_well_labels normalization_mode: normalization_mode + clonotype_grouping: clonotype_grouping threads: threads memory_limit: memory_limit virt_memory_limit: memory_limit @@ -191,10 +238,13 @@ steps: - secondary_analysis_report_folder - filtered_feature_bc_matrix_folder - filtered_feature_bc_matrix_h5 - - raw_feature_bc_matrices_folder - - raw_feature_bc_matrices_h5 - aggregation_metadata - 
loupe_browser_track + - clonotypes_csv + - consensus_sequences_fasta + - consensus_annotations_csv + - filtered_contig_annotations_csv + - loupe_vdj_browser_track - stdout_log - stderr_log @@ -205,13 +255,6 @@ steps: out: - compressed_folder - compress_raw_feature_bc_matrices_folder: - run: ../tools/tar-compress.cwl - in: - folder_to_compress: aggregate_counts/raw_feature_bc_matrices_folder - out: - - compressed_folder - compress_secondary_analysis_report_folder: run: ../tools/tar-compress.cwl in: @@ -284,4 +327,6 @@ s:creator: doc: | Cell Ranger Aggregate - ===================== + + Aggregates outputs from multiple runs of Cell Ranger Count Gene Expression or + Cell Ranger Multi Gene Expression and V(D)J Repertoire Profiling experiments diff --git a/workflows/cellranger-arc-aggr.cwl b/workflows/cellranger-arc-aggr.cwl index 9b682690..86eccdcb 100644 --- a/workflows/cellranger-arc-aggr.cwl +++ b/workflows/cellranger-arc-aggr.cwl @@ -13,7 +13,8 @@ requirements: sc_rnaseq_sample: - "cellranger-arc-count.cwl" genome_indices: - - "cellranger-mkref.cwl" + - "cellranger-mkref.cwl" + inputs: @@ -36,7 +37,7 @@ inputs: doc: "Molecule-level information from individual runs of cellranger-arc count" 'sd:upstreamSource': "sc_rnaseq_sample/gex_molecule_info_h5" - atac_fragments_file: + atac_fragments_file_from_count: type: File[] secondaryFiles: - .tbi @@ -105,7 +106,7 @@ outputs: doc: | Aggregated run summary metrics in CSV format - atac_fragments_aggr_file: + atac_fragments_file: type: File outputSource: aggregate_counts/atac_fragments_file label: "Aggregated count and barcode information" @@ -225,7 +226,7 @@ steps: aggregate_counts: run: ../tools/cellranger-arc-aggr.cwl in: - atac_fragments_file: atac_fragments_file + atac_fragments_file_from_count: atac_fragments_file_from_count barcode_metrics_report: barcode_metrics_report gex_molecule_info_h5: gex_molecule_info_h5 gem_well_labels: gem_well_labels @@ -300,7 +301,7 @@ label: "Cell Ranger ARC Aggregate" s:name: "Cell Ranger 
ARC Aggregate" s:alternateName: "Aggregates data from multiple Cell Ranger ARC Count Gene Expression + ATAC experiments" -s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/cellranger-aggr.cwl +s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/cellranger-arc-aggr.cwl s:codeRepository: https://github.com/datirium/workflows s:license: http://www.apache.org/licenses/LICENSE-2.0 diff --git a/workflows/cellranger-arc-count.cwl b/workflows/cellranger-arc-count.cwl index b8a02d41..c75357cf 100644 --- a/workflows/cellranger-arc-count.cwl +++ b/workflows/cellranger-arc-count.cwl @@ -11,7 +11,7 @@ requirements: 'sd:upstream': genome_indices: - - "cellranger-mkref.cwl" + - "cellranger-mkref.cwl" inputs: @@ -35,8 +35,8 @@ inputs: - type: array items: File format: "http://edamontology.org/format_1930" - label: "GEX FASTQ file R1 (optionally compressed)" - doc: "GEX FASTQ file R1 (optionally compressed)" + label: "GEX FASTQ file(s) R1 (optionally compressed)" + doc: "GEX FASTQ file(s) R1 (optionally compressed)" gex_fastq_file_r2: type: @@ -44,8 +44,8 @@ inputs: - type: array items: File format: "http://edamontology.org/format_1930" - label: "GEX FASTQ file R2 (optionally compressed)" - doc: "GEX FASTQ file R2 (optionally compressed)" + label: "GEX FASTQ file(s) R2 (optionally compressed)" + doc: "GEX FASTQ file(s) R2 (optionally compressed)" atac_fastq_file_r1: type: @@ -53,8 +53,8 @@ inputs: - type: array items: File format: "http://edamontology.org/format_1930" - label: "ATAC FASTQ file R1 (optionally compressed)" - doc: "ATAC FASTQ file R1 (optionally compressed)" + label: "ATAC FASTQ file(s) R1 (optionally compressed)" + doc: "ATAC FASTQ file(s) R1 (optionally compressed)" atac_fastq_file_r2: type: @@ -62,8 +62,8 @@ inputs: - type: array items: File format: "http://edamontology.org/format_1930" - label: "ATAC FASTQ file R2 (optionally compressed)" - doc: "ATAC FASTQ file R2 (optionally compressed)" + 
label: "ATAC FASTQ file(s) R2 (optionally compressed)" + doc: "ATAC FASTQ file(s) R2 (optionally compressed)" atac_fastq_file_r3: type: @@ -71,8 +71,8 @@ inputs: - type: array items: File format: "http://edamontology.org/format_1930" - label: "ATAC FASTQ file R3 (optionally compressed)" - doc: "ATAC FASTQ file R3 (optionally compressed)" + label: "ATAC FASTQ file(s) R3 (optionally compressed)" + doc: "ATAC FASTQ file(s) R3 (optionally compressed)" exclude_introns: type: boolean? @@ -327,15 +327,31 @@ outputs: doc: | stderr log generated by cellranger-arc count - collected_statistics: + collected_statistics_yaml: type: File - outputSource: collect_statistics/collected_statistics + outputSource: collect_statistics/collected_statistics_yaml + label: "Collected statistics in YAML format" + doc: "Collected statistics in YAML format" + + collected_statistics_md: + type: File + outputSource: collect_statistics/collected_statistics_md label: "Collected statistics in Markdown format" doc: "Collected statistics in Markdown format" 'sd:visualPlugins': - markdownView: tab: 'Overview' + collected_statistics_tsv: + type: File + outputSource: collect_statistics/collected_statistics_tsv + label: "Collected statistics in TSV format" + doc: "Collected statistics in TSV format" + 'sd:visualPlugins': + - tableView: + vertical: true + tab: 'Overview' + compressed_html_data_folder: type: File outputSource: compress_html_data_folder/compressed_folder @@ -367,45 +383,45 @@ steps: extract_gex_fastq_r1: run: ../tools/extract-fastq.cwl in: - output_prefix: - default: "gex_read_1" compressed_file: gex_fastq_file_r1 + output_prefix: + default: "gex_read_1" out: - fastq_file extract_gex_fastq_r2: run: ../tools/extract-fastq.cwl in: - output_prefix: - default: "gex_read_2" compressed_file: gex_fastq_file_r2 + output_prefix: + default: "gex_read_2" out: - fastq_file extract_atac_fastq_r1: run: ../tools/extract-fastq.cwl in: - output_prefix: - default: "atac_read_1" compressed_file: 
atac_fastq_file_r1 + output_prefix: + default: "atac_read_1" out: - fastq_file extract_atac_fastq_r2: run: ../tools/extract-fastq.cwl in: + compressed_file: atac_fastq_file_r2 output_prefix: default: "atac_read_2" - compressed_file: atac_fastq_file_r2 out: - fastq_file extract_atac_fastq_r3: run: ../tools/extract-fastq.cwl in: - output_prefix: - default: "atac_read_3" compressed_file: atac_fastq_file_r3 + output_prefix: + default: "atac_read_3" out: - fastq_file @@ -506,45 +522,13 @@ steps: - compressed_folder collect_statistics: - run: - cwlVersion: v1.0 - class: CommandLineTool - hints: - - class: DockerRequirement - dockerPull: rackspacedot/python37 - inputs: - script: - type: string? - default: | - #!/usr/bin/env python3 - import sys, csv - with open(sys.argv[1], "r") as input_stream: - with open("collected_statistics.md", "w") as output_stream: - output_stream.write("### Cell Ranger ARC Statistics\n") - keys, values = None, None - for i, row in enumerate(csv.reader(input_stream)): - if i==0: - keys = row - else: - values = row - for k,v in zip(keys, values): - output_stream.write("- "+k+": "+v+"\n") - inputBinding: - position: 5 - metrics_summary_report: - type: File - inputBinding: - position: 6 - outputs: - collected_statistics: - type: File - outputBinding: - glob: "*" - baseCommand: ["python3", "-c"] + run: ../tools/collect-stats-sc-arc-count.cwl in: metrics_summary_report: generate_counts_matrix/metrics_summary_report out: - - collected_statistics + - collected_statistics_yaml + - collected_statistics_tsv + - collected_statistics_md cellbrowser_build: run: ../tools/cellbrowser-build-cellranger-arc.cwl diff --git a/workflows/cellranger-mkref.cwl b/workflows/cellranger-mkref.cwl index 9fbec303..18489713 100644 --- a/workflows/cellranger-mkref.cwl +++ b/workflows/cellranger-mkref.cwl @@ -11,7 +11,7 @@ requirements: 'sd:upstream': genome_indices: - - "genome-indices.cwl" + - "genome-indices.cwl" inputs: diff --git a/workflows/cellranger-mkvdjref.cwl 
b/workflows/cellranger-mkvdjref.cwl new file mode 100644 index 00000000..159fa3ee --- /dev/null +++ b/workflows/cellranger-mkvdjref.cwl @@ -0,0 +1,144 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: InlineJavascriptRequirement + - class: MultipleInputFeatureRequirement + + +inputs: + + alias: + type: string + label: "Experiment short name/Alias" + sd:preview: + position: 1 + + genome_fasta_file: + type: File + label: "Genome FASTA file. Hard/soft-masked files are not allowed." + doc: | + Genome FASTA file. Hard/soft-masked files are not allowed. + For example: + https://ftp.ensembl.org/pub/current_fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz + + annotation_gtf_file: + type: File + label: "GTF annotation file. Should include gene_biotype/transcript_biotype fields." + doc: | + GTF annotation file. Should include gene_biotype/transcript_biotype fields. + For example: + https://ftp.ensembl.org/pub/current_gtf/homo_sapiens/Homo_sapiens.GRCh38.108.gtf.gz + + +outputs: + + indices_folder: + type: Directory + outputSource: cellranger_mkvdjref/indices_folder + label: Cell Ranger V(D)J genome indices + doc: | + Cell Ranger V(D)J-compatible reference folder. + This folder will include V(D)J segment FASTA file. 
+ + stdout_log: + type: File + outputSource: cellranger_mkvdjref/stdout_log + label: stdout log generated by cellranger mkvdjref + doc: | + stdout log generated by cellranger mkvdjref + + stderr_log: + type: File + outputSource: cellranger_mkvdjref/stderr_log + label: stderr log generated by cellranger mkvdjref + doc: | + stderr log generated by cellranger mkvdjref + + +steps: + + extract_fasta: + run: ../tools/extract-7z.cwl + in: + file_to_extract: genome_fasta_file + output_filename: + default: "annotation.fasta" + out: + - extracted_file + + extract_gtf: + run: ../tools/extract-7z.cwl + in: + file_to_extract: annotation_gtf_file + output_filename: + default: "annotation.gtf" + out: + - extracted_file + + cellranger_mkvdjref: + run: ../tools/cellranger-mkvdjref.cwl + in: + genome_fasta_file: extract_fasta/extracted_file + annotation_gtf_file: extract_gtf/extracted_file + output_folder_name: + default: "cellranger_vdj_ref" + out: + - indices_folder + - stdout_log + - stderr_log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Cell Ranger Build V(D)J Reference Indices" +s:name: "Cell Ranger Build V(D)J Reference Indices" +s:alternateName: "Build a Cell Ranger V(D)J-compatible reference folder from a user-supplied genome FASTA and gene GTF files" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/cellranger-mkvdjref.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + 
s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Cell Ranger Build V(D)J Reference Indices + + Build a Cell Ranger V(D)J-compatible reference folder from + a user-supplied genome FASTA and gene GTF files. \ No newline at end of file diff --git a/workflows/cellranger-multi.cwl b/workflows/cellranger-multi.cwl new file mode 100644 index 00000000..cc0d1790 --- /dev/null +++ b/workflows/cellranger-multi.cwl @@ -0,0 +1,676 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: InlineJavascriptRequirement + - class: MultipleInputFeatureRequirement + + +'sd:upstream': + gex_indices: + - "cellranger-mkref.cwl" + vdj_indices: + - "cellranger-mkvdjref.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name/Alias" + sd:preview: + position: 1 + + gex_indices_folder: + type: Directory + label: "Transcriptome reference" + doc: | + Path of folder containing 10x-compatible transcriptome reference. + Should be generated by "cellranger mkref" command + 'sd:upstreamSource': "gex_indices/indices_folder" + 'sd:localLabel': true + + vdj_indices_folder: + type: Directory + label: "V(D)J reference" + doc: | + Path of folder containing Cell Ranger V(D)J-compatible reference. 
+ Should be generated by "cellranger mkvdjref" command + 'sd:upstreamSource': "vdj_indices/indices_folder" + 'sd:localLabel': true + + gex_fastq_file_r1: + type: + - File + - type: array + items: File + format: "http://edamontology.org/format_1930" + label: "GEX FASTQ file(s) R1 (optionally compressed)" + doc: "GEX FASTQ file(s) R1 (optionally compressed)" + + gex_fastq_file_r2: + type: + - File + - type: array + items: File + format: "http://edamontology.org/format_1930" + label: "GEX FASTQ file(s) R2 (optionally compressed)" + doc: "GEX FASTQ file(s) R2 (optionally compressed)" + + vdj_fastq_file_r1: + type: + - File + - type: array + items: File + format: "http://edamontology.org/format_1930" + label: "V(D)J FASTQ file(s) R1 (optionally compressed)" + doc: "V(D)J FASTQ file(s) R1 (optionally compressed)" + + vdj_fastq_file_r2: + type: + - File + - type: array + items: File + format: "http://edamontology.org/format_1930" + label: "V(D)J FASTQ file(s) R2 (optionally compressed)" + doc: "V(D)J FASTQ file(s) R2 (optionally compressed)" + + vdj_chain_type: + type: + - "null" + - type: enum + name: "chain_type" + symbols: + - "VDJ" + - "VDJ-T" + - "VDJ-B" + - "VDJ-T-GD" + default: "VDJ" + label: "V(D)J chain type. Use VDJ for auto detection." + doc: | + V(D)J chain type. Setting to VDJ will auto-detect the chain type. + Auto-detection does not work for TRG/D (gamma-delta) chains. + Note that gamma-delta analysis is enabled but the algorithm has + not been tested extensively. + 'sd:layout': + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "2" + - "3" + - "4" + default: "2" + label: "Number of cores/cpus to use" + doc: "Number of cores/cpus for those steps that support multithreading" + 'sd:layout': + advanced: true + + memory_limit: + type: int? + default: 20 + label: "Transcriptome reference" + doc: | + Maximum memory used (GB). + The same as was used for generating indices. 
+ The same will be applied to virtual memory + 'sd:upstreamSource': "gex_indices/memory_limit" + 'sd:localLabel': true + + +outputs: + + fastqc_report_gex_fastq_r1: + type: File + outputSource: run_fastqc_for_gex_fastq_r1/html_file + label: "FastQC report for GEX FASTQ file R1" + doc: | + FastQC report for GEX FASTQ file R1 + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + fastqc_report_gex_fastq_r2: + type: File + outputSource: run_fastqc_for_gex_fastq_r2/html_file + label: "FastQC report for GEX FASTQ file R2" + doc: | + FastQC report for GEX FASTQ file R2 + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + fastqc_report_vdj_fastq_r1: + type: File + outputSource: run_fastqc_for_vdj_fastq_r1/html_file + label: "FastQC report for V(D)J FASTQ file R1" + doc: | + FastQC report for V(D)J FASTQ file R1 + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + fastqc_report_vdj_fastq_r2: + type: File + outputSource: run_fastqc_for_vdj_fastq_r2/html_file + label: "FastQC report for V(D)J FASTQ file R2" + doc: | + FastQC report for V(D)J FASTQ file R2 + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + web_summary_report: + type: File + outputSource: cellranger_multi/web_summary_report + label: "Gene Expression and V(D)J Repertoire Profiling" + doc: | + Run summary metrics and charts in HTML format + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + metrics_summary_report: + type: File + outputSource: convert_metrics_summary_report_to_tsv/output_file + label: "Run summary metrics in TSV format" + doc: | + Run summary metrics in TSV format + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'QC metrics' + Title: 'QC metrics' + + possorted_genome_bam_bai: + type: File + outputSource: cellranger_multi/possorted_genome_bam_bai + label: "Unaligned and aligned to the genome and transcriptome indexed reads" + doc: | + Indexed GEX BAM file containing
position-sorted reads aligned + to the genome and transcriptome, as well as unaligned reads. + + filtered_feature_bc_matrix_folder: + type: File + outputSource: compress_filtered_feature_bc_matrix_folder/compressed_folder + label: "Filtered feature-barcode matrices in MEX format" + doc: | + Folder with filtered feature-barcode matrices containing only cellular + barcodes in MEX format. Each element of the matrix is the number of UMIs + associated with a feature (row) and a barcode (column). + + filtered_feature_bc_matrix_h5: + type: File + outputSource: cellranger_multi/filtered_feature_bc_matrix_h5 + label: "Filtered feature-barcode matrices in HDF5 format" + doc: | + Filtered feature-barcode matrices containing only cellular + barcodes in HDF5 format. Each element of the matrix is the + number of UMIs associated with a feature (row) and a + barcode (column). + + raw_feature_bc_matrices_folder: + type: File + outputSource: compress_raw_feature_bc_matrices_folder/compressed_folder + label: "Unfiltered feature-barcode matrices in MEX format" + doc: | + Folder with unfiltered feature-barcode matrices containing all barcodes + in MEX format. Each element of the matrix is the number of UMIs associated + with a feature (row) and a barcode (column). + + raw_feature_bc_matrices_h5: + type: File + outputSource: cellranger_multi/raw_feature_bc_matrices_h5 + label: "Unfiltered feature-barcode matrices in HDF5 format" + doc: | + Unfiltered feature-barcode matrices containing all barcodes in HDF5 format. + Each element of the matrix is the number of UMIs associated with a feature + (row) and a barcode (column). 
+ + secondary_analysis_report_folder: + type: File + outputSource: compress_secondary_analysis_report_folder/compressed_folder + label: "Folder with secondary analysis of GEX data" + doc: | + Folder with secondary analysis of GEX data including dimensionality + reduction, cell clustering, and differential expression + + loupe_browser_track: + type: File + outputSource: cellranger_multi/loupe_browser_track + label: "Loupe Browser visualization and analysis file" + doc: | + Loupe Browser visualization and analysis file + + all_contig_reads_bam_bai: + type: File + outputSource: cellranger_multi/all_contig_reads_bam_bai + label: "Indexed V(D)J BAM file with reads aligned to ALL assembled contigs, per cell barcode" + doc: | + Indexed V(D)J BAM file with reads aligned to ALL assembled contigs, per cell barcode. + This file demonstrates how the reads and UMIs support the assembled contigs within + a cell barcode. Reads are not aligned across cell barcode boundaries. Please note + that this BAM excludes reads whose barcodes don't match the whitelist, so it is not + suitable as an archive of every single input read. + This file includes reads from all cells barcodes identified by V(D)J algorithm including + those ones that will be later discarded as non-viable cells by V(D)J algorithm and those + barcodes that will be later removed after overlapping with cells called by GEX algorithm. + + all_contig_sequences_fasta: + type: File + outputSource: cellranger_multi/all_contig_sequences_fasta + label: "FASTA format sequence for ALL assembled contigs in the V(D)J library" + doc: | + FASTA format sequence for ALL assembled contigs in the V(D)J library. + This file includes both productive and non-productive contigs with high and low confidence + assembled for all identified cells barcodes including those ones that will be later discarded + as non-viable cells by V(D)J algorithm or after overlapping with cells called by GEX algorithm. 
+ + all_contig_annotations_bed: + type: File + outputSource: cellranger_multi/all_contig_annotations_bed + label: "BED file with high-level and detailed annotations of ALL assembled contigs (from cell and background barcodes)" + doc: | + BED file with high-level and detailed annotations of ALL assembled contigs (from cell and + background barcodes). Used for further investigation into why some contigs were filtered + out. This file includes both productive and non-productive contigs with high and low + confidence assembled for all identified cells barcodes including those ones that will be + later discarded as non-viable cells by V(D)J algorithm or after overlapping with cells + called by GEX algorithm. + + all_contig_annotations_csv: + type: File + outputSource: cellranger_multi/all_contig_annotations_csv + label: "CSV file with high-level and detailed annotations of ALL assembled contigs (from cell and background barcodes)" + doc: | + CSV file with high-level and detailed annotations of ALL assembled contigs (from cell and + background barcodes). Used for further investigation into why some contigs were filtered + out. This file includes both productive and non-productive contigs with high and low + confidence assembled for all identified cells barcodes including those ones that will be + later discarded as non-viable cells by V(D)J algorithm or after overlapping with cells + called by GEX algorithm. + + airr_rearrangement_tsv: + type: File + outputSource: cellranger_multi/airr_rearrangement_tsv + label: "Annotated contigs and consensus sequences of V(D)J rearrangements in the AIRR format" + doc: | + Annotated contigs and consensus sequences of V(D)J rearrangements + in the AIRR format. It includes only viable cells identified by + both V(D)J and GEX algorithms. 
+ + clonotypes_tsv: + type: File + outputSource: convert_clonotypes_csv_to_tsv/output_file + label: "TSV file with high-level descriptions of each clonotype" + doc: | + TSV file with high-level descriptions of each clonotype. During the clonotype + grouping stage, cell barcodes are placed in groups called clonotypes. Only viable + cells identified by both V(D)J and GEX algorithms are used. Each clonotype consists + of all descendants of a single, fully rearranged common ancestor, as approximated + computationally. During this process, some cell barcodes are flagged as likely + artifacts and filtered out, meaning that they are no longer called as cells. + However, as clonotype grouping stage is happening before forming the final version + of files in the per_sample_outs folder, the reported cells number won't be affected. + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'V(D)J clonotypes' + Title: 'V(D)J clonotypes' + + germline_contigs_bam_bai: + type: File + outputSource: cellranger_multi/germline_contigs_bam_bai + label: "Indexed V(D)J BAM file with contigs aligned to concatenated germline segments" + doc: | + Indexed V(D)J BAM file with contigs aligned to concatenated germline + segments. For each clonotype consensus, the reference sequence is the + annotated germline segments concatenated together. This file shows how + both the per-cell contigs and the clonotype consensus contig relate to + the germline reference. Useful for revealing polymorphisms, somatic + mutations, and recombination-induced differences such as non-templated + nucleotide additions. + + germline_sequences_fasta: + type: File + outputSource: cellranger_multi/germline_sequences_fasta + label: "Concatenated V(D)J reference segments for the segments detected on each consensus sequence" + doc: | + Concatenated V(D)J reference segments for the segments detected on each + consensus sequence. These serve as an approximate reference for each + consensus sequence.
+ + consensus_contigs_bam_bai: + type: File + outputSource: cellranger_multi/consensus_contigs_bam_bai + label: "Indexed V(D)J BAM file with contigs aligned to clonotype consensus" + doc: | + Indexed V(D)J BAM file with contigs aligned to clonotype consensus. + Each "reference" sequence is a clonotype consensus sequence, and each + record is an alignment of a single cell's contig against this consensus. + This file shows, for a clonotype consensus sequences, how the constituent + per-cell assemblies support the consensus. + + consensus_sequences_fasta: + type: File + outputSource: cellranger_multi/consensus_sequences_fasta + label: "The consensus sequence of each assembled contig" + doc: | + The consensus sequence of each assembled contig. + + consensus_annotations_csv: + type: File + outputSource: cellranger_multi/consensus_annotations_csv + label: "CSV file with high-level and detailed annotations of each clonotype consensus sequence" + doc: | + CSV file with high-level and detailed annotations of each clonotype + consensus sequence. + + filtered_contig_annotations_csv: + type: File + outputSource: cellranger_multi/filtered_contig_annotations_csv + label: "CSV file with high-level annotations of each high-confidence contig from cell-associated barcodes" + doc: | + CSV file with high-level annotations of each high-confidence contig from + cell-associated barcodes. This is a subset of all_contig_annotations.csv. + + filtered_contig_sequences_fasta: + type: File + outputSource: cellranger_multi/filtered_contig_sequences_fasta + label: "FASTA format sequence for only high-confidence contigs in cell barcodes" + doc: | + FASTA format sequence for only high-confidence contigs in cell barcodes. 
+ + loupe_vdj_browser_track: + type: File + outputSource: cellranger_multi/loupe_vdj_browser_track + label: "Loupe V(D)J Browser visualization and analysis file" + doc: | + Loupe V(D)J Browser visualization and analysis file + + filtered_data_folder: + type: Directory + outputSource: cellranger_multi/filtered_data_folder + label: "Folder containing filtered data, i.e., only cell-associated barcodes" + doc: | + Folder containing filtered data, i.e., only cell-associated barcodes. + Used by cellranger aggr to aggregate samples for joint analysis. + + compressed_html_data_folder: + type: File + outputSource: compress_html_data_folder/compressed_folder + label: "Compressed folder with CellBrowser formatted results" + doc: | + Compressed folder with CellBrowser formatted results + + html_data_folder: + type: Directory + outputSource: cellbrowser_build/html_data + label: "Folder with not compressed CellBrowser formatted results" + doc: | + Folder with not compressed CellBrowser formatted results + + cellbrowser_report: + type: File + outputSource: cellbrowser_build/index_html_file + label: "UCSC Cell Browser" + doc: | + CellBrowser formatted Cellranger report + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + cellranger_multi_stdout_log: + type: File + outputSource: cellranger_multi/stdout_log + label: stdout log generated by cellranger multi + doc: | + stdout log generated by cellranger multi + + cellranger_multi_stderr_log: + type: File + outputSource: cellranger_multi/stderr_log + label: stderr log generated by cellranger multi + doc: | + stderr log generated by cellranger multi + + +steps: + + extract_gex_fastq_r1: + run: ../tools/extract-fastq.cwl + in: + compressed_file: gex_fastq_file_r1 + output_prefix: + default: "gex_read_1" + out: + - fastq_file + + extract_gex_fastq_r2: + run: ../tools/extract-fastq.cwl + in: + compressed_file: gex_fastq_file_r2 + output_prefix: + default: "gex_read_2" + out: + - fastq_file + + extract_vdj_fastq_r1: 
+ run: ../tools/extract-fastq.cwl + in: + compressed_file: vdj_fastq_file_r1 + output_prefix: + default: "vdj_read_1" + out: + - fastq_file + + extract_vdj_fastq_r2: + run: ../tools/extract-fastq.cwl + in: + compressed_file: vdj_fastq_file_r2 + output_prefix: + default: "vdj_read_2" + out: + - fastq_file + + run_fastqc_for_gex_fastq_r1: + run: ../tools/fastqc.cwl + in: + reads_file: extract_gex_fastq_r1/fastq_file + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - html_file + + run_fastqc_for_gex_fastq_r2: + run: ../tools/fastqc.cwl + in: + reads_file: extract_gex_fastq_r2/fastq_file + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - html_file + + run_fastqc_for_vdj_fastq_r1: + run: ../tools/fastqc.cwl + in: + reads_file: extract_vdj_fastq_r1/fastq_file + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - html_file + + run_fastqc_for_vdj_fastq_r2: + run: ../tools/fastqc.cwl + in: + reads_file: extract_vdj_fastq_r2/fastq_file + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - html_file + + cellranger_multi: + run: ../tools/cellranger-multi.cwl + in: + gex_fastq_file_r1: extract_gex_fastq_r1/fastq_file + gex_fastq_file_r2: extract_gex_fastq_r2/fastq_file + vdj_fastq_file_r1: extract_vdj_fastq_r1/fastq_file + vdj_fastq_file_r2: extract_vdj_fastq_r2/fastq_file + gex_indices_folder: gex_indices_folder + vdj_indices_folder: vdj_indices_folder + vdj_chain_type: vdj_chain_type + threads: + source: threads + valueFrom: $(parseInt(self)) + memory_limit: memory_limit + virt_memory_limit: memory_limit + out: + - web_summary_report + - metrics_summary_report + - possorted_genome_bam_bai + - filtered_feature_bc_matrix_folder + - filtered_feature_bc_matrix_h5 + - raw_feature_bc_matrices_folder + - raw_feature_bc_matrices_h5 + - secondary_analysis_report_folder + - loupe_browser_track + - all_contig_reads_bam_bai + - all_contig_sequences_fasta + - all_contig_annotations_bed + - all_contig_annotations_csv 
+ - airr_rearrangement_tsv + - clonotypes_csv + - germline_contigs_bam_bai + - germline_sequences_fasta + - consensus_contigs_bam_bai + - consensus_sequences_fasta + - consensus_annotations_csv + - filtered_contig_annotations_csv + - filtered_contig_sequences_fasta + - loupe_vdj_browser_track + - filtered_data_folder + - stdout_log + - stderr_log + + compress_filtered_feature_bc_matrix_folder: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: cellranger_multi/filtered_feature_bc_matrix_folder + out: + - compressed_folder + + compress_raw_feature_bc_matrices_folder: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: cellranger_multi/raw_feature_bc_matrices_folder + out: + - compressed_folder + + compress_secondary_analysis_report_folder: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: cellranger_multi/secondary_analysis_report_folder + out: + - compressed_folder + + convert_clonotypes_csv_to_tsv: + run: ../tools/custom-bash.cwl + in: + input_file: cellranger_multi/clonotypes_csv + script: + default: | + cat "$0" | tr "," "\t" > `basename $0 csv`tsv + out: + - output_file + + convert_metrics_summary_report_to_tsv: + run: ../tools/custom-bash.cwl + in: + input_file: cellranger_multi/metrics_summary_report + script: + default: | + cat "$0" | tr "," "\t" > `basename $0 csv`tsv + out: + - output_file + + cellbrowser_build: + run: ../tools/cellbrowser-build-cellranger.cwl + in: + secondary_analysis_report_folder: cellranger_multi/secondary_analysis_report_folder + filtered_feature_bc_matrix_folder: cellranger_multi/filtered_feature_bc_matrix_folder + out: + - html_data + - index_html_file + + compress_html_data_folder: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: cellbrowser_build/html_data + out: + - compressed_folder + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Cell Ranger Multi Gene Expression and V(D)J 
Repertoire Profiling" +s:name: "Cell Ranger Multi Gene Expression and V(D)J Repertoire Profiling" +s:alternateName: "Quantifies gene expression and performs profiling of V(D)J repertoire from a single GEM well" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/cellranger-multi.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Cell Ranger Multi Gene Expression and V(D)J Repertoire Profiling + + Quantifies gene expression and performs profiling of V(D)J + repertoire from a single GEM well \ No newline at end of file diff --git a/workflows/cellranger-reanalyze.cwl b/workflows/cellranger-reanalyze.cwl index 66aca6f0..cb34623b 100644 --- a/workflows/cellranger-reanalyze.cwl +++ b/workflows/cellranger-reanalyze.cwl @@ -10,9 +10,9 @@ requirements: 'sd:upstream': - sc_rnaseq_sample: + sc_experiment: - "single-cell-preprocess-cellranger.cwl" - - "cellranger-aggr.cwl" + - "cellranger-multi.cwl" inputs: @@ -25,14 +25,14 @@ inputs: filtered_feature_bc_matrix_h5: type: File - label: "scRNA-Seq 
Cell Ranger Experiment" - doc: "Filtered feature-barcode matrices in HDF5 format from cellranger count or aggr results" - 'sd:upstreamSource': "sc_rnaseq_sample/filtered_feature_bc_matrix_h5" + label: "Single-cell Experiment" + doc: "Filtered feature-barcode matrices in HDF5 format from cellranger count/multi" + 'sd:upstreamSource': "sc_experiment/filtered_feature_bc_matrix_h5" 'sd:localLabel': true selected_barcodes: type: File? - label: "A CSV file containing a list of cell barcodes to use for reanalysis" + label: "CSV file containing a list of cell barcodes to use for reanalysis" doc: | A CSV file containing a list of cell barcodes to use for reanalysis, e.g. barcodes exported from Loupe Browser. All barcodes must be present @@ -40,7 +40,7 @@ inputs: selected_genes: type: File? - label: "A CSV file containing a list of gene IDs to use for reanalysis" + label: "CSV file containing a list of gene IDs to use for reanalysis" doc: | A CSV file containing a list of gene IDs to use for reanalysis (corresponding to the gene_id field of the reference GTF). All gene IDs must be present in @@ -49,7 +49,7 @@ inputs: excluded_genes: type: File? - label: "A CSV file containing a list of gene IDs to exclude for reanalysis. Applied after setting selected genes" + label: "CSV file containing a list of gene IDs to exclude for reanalysis. Applied after setting selected genes" doc: | A CSV file containing a list of gene IDs to exclude for reanalysis (corresponding to the gene_id field of the reference GTF). All gene IDs must be present in @@ -57,7 +57,7 @@ inputs: Note that only gene features are used in secondary analysis. Feature Barcode features are ignored. - force_cells_num: + force_cells: type: int? 
default: null label: "Force pipeline to use this number of cells, bypassing the cell detection algorithm" @@ -427,6 +427,14 @@ outputs: tab: 'Overview' target: "_blank" + filtered_feature_bc_matrix_folder: + type: File + outputSource: compress_filtered_feature_bc_matrix_folder/compressed_folder + label: "Compressed folder with filtered feature-barcode matrices" + doc: | + Compressed folder with filtered feature-barcode matrices containing only cellular barcodes in MEX format. + When implemented, in Targeted Gene Expression samples, the non-targeted genes won't be present. + reanalyze_params: type: File outputSource: reanalyze/reanalyze_params @@ -441,6 +449,31 @@ outputs: doc: | Loupe Browser visualization and analysis file for reanalyzed results + compressed_html_data_folder: + type: File + outputSource: compress_html_data_folder/compressed_folder + label: "Compressed folder with CellBrowser formatted results" + doc: | + Compressed folder with CellBrowser formatted results + + html_data_folder: + type: Directory + outputSource: cellbrowser_build/html_data + label: "Folder with not compressed CellBrowser formatted results" + doc: | + Folder with not compressed CellBrowser formatted results + + cellbrowser_report: + type: File + outputSource: cellbrowser_build/index_html_file + label: "CellBrowser formatted Cellranger report" + doc: | + CellBrowser formatted Cellranger report + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + reanalyze_stdout_log: type: File outputSource: reanalyze/stdout_log @@ -465,7 +498,7 @@ steps: selected_barcodes: selected_barcodes selected_genes: selected_genes excluded_genes: excluded_genes - force_cells_num: force_cells_num + force_cells: force_cells threads: threads memory_limit: memory_limit virt_memory_limit: memory_limit @@ -497,11 +530,19 @@ steps: out: - secondary_analysis_report_folder - web_summary_report + - filtered_feature_bc_matrix_folder - reanalyze_params - loupe_browser_track - stdout_log - stderr_log 
+ compress_filtered_feature_bc_matrix_folder: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: reanalyze/filtered_feature_bc_matrix_folder + out: + - compressed_folder + compress_secondary_analysis_report_folder: run: ../tools/tar-compress.cwl in: @@ -509,6 +550,22 @@ steps: out: - compressed_folder + cellbrowser_build: + run: ../tools/cellbrowser-build-cellranger.cwl + in: + secondary_analysis_report_folder: reanalyze/secondary_analysis_report_folder + filtered_feature_bc_matrix_folder: reanalyze/filtered_feature_bc_matrix_folder + out: + - html_data + - index_html_file + + compress_html_data_folder: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: cellbrowser_build/html_data + out: + - compressed_folder + $namespaces: s: http://schema.org/ @@ -518,7 +575,7 @@ $schemas: label: "Cellranger Reanalyze" s:name: "Cellranger Reanalyze" -s:alternateName: "Reruns secondary analysis for Cell Ranger Count Gene Expression or Cell Ranger Aggregate experiments" +s:alternateName: "Reruns secondary analysis for Cell Ranger Count Gene Expression or Cell Ranger Multi experiments" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/cellranger-reanalyze.cwl s:codeRepository: https://github.com/datirium/workflows diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl new file mode 100644 index 00000000..cfa6ff67 --- /dev/null +++ b/workflows/sc-atac-cluster.cwl @@ -0,0 +1,519 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var split_features = function(line) { + function get_unique(value, index, self) { + return self.indexOf(value) === index && value != ""; + } + let splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + - var 
split_numbers = function(line) { + let splitted_line = line?line.split(/[\s,]+/).map(parseFloat):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + + +'sd:upstream': + sc_tools_sample: + - "sc-atac-cluster.cwl" + - "sc-rna-cluster.cwl" + - "sc-rna-reduce.cwl" + - "sc-atac-reduce.cwl" + sc_arc_sample: + - "cellranger-arc-count.cwl" + - "cellranger-arc-aggr.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name/alias" + sd:preview: + position: 1 + + query_data_rds: + type: File + label: "Experiment run through Single-cell ATAC-Seq Dimensionality Reduction Analysis" + doc: | + Path to the RDS file to load Seurat object from. This file should include + chromatin accessibility information stored in the ATAC assay, as well as + 'atac_lsi' and 'atacumap' dimensionality reductions applied to that assay. + 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" + 'sd:localLabel': true + + dimensions: + type: int? + default: 40 + label: "Dimensionality to use when constructing nearest-neighbor graph before clustering (from 1 to 50)" + doc: | + Dimensionality to use when constructing nearest-neighbor graph before clustering + (from 1 to 50). If single value N is provided, use from 2 to N dimensions. If + multiple values are provided, subset to only selected dimensions. + Default: from 2 to 40 + + cluster_algorithm: + type: + - "null" + - type: enum + symbols: + - "louvain" + - "mult-louvain" + - "slm" + - "leiden" + default: "slm" + label: "Algorithm for modularity optimization when running clustering" + doc: | + Algorithm for modularity optimization when running clustering. + Default: slm + + resolution: + type: float? + default: 0.3 + label: "Clustering resolution" + doc: | + Clustering resolution applied to the constructed nearest-neighbor graph.
+ Can be set as an array but only the first item from the list will be used + for cluster labels and peak markers in the UCSC Cell Browser when running + with --cbbuild and --diffpeaks parameters. + Default: 0.3 + + atac_fragments_file: + type: File? + secondaryFiles: + - .tbi + label: "Cell Ranger ARC Count/Aggregate Experiment" + doc: | + Count and barcode information for every ATAC fragment used in the loaded Seurat + object. File should be saved in TSV format with tbi-index file. + 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" + + genes_of_interest: + type: string? + default: null + label: "Genes of interest to build Tn5 insertion frequency plots for the nearest peaks" + doc: | + Genes of interest to build Tn5 insertion frequency plots for the nearest peaks. + If loaded Seurat object includes genes expression information in the RNA assay + it will be additionally shown on the right side of the plots. + Ignored if '--fragments' is not provided. + Default: None + + identify_diff_peaks: + type: boolean? + default: false + label: "Identify differentially accessible peaks between each pair of clusters" + doc: | + Identify differentially accessible peaks between each pair of clusters for all resolutions. + Default: false + 'sd:layout': + advanced: true + + minimum_logfc: + type: float? + default: 0.25 + label: "Include only those peaks that on average have log fold change difference in the chromatin accessibility between every tested pair of clusters not lower than this value" + doc: | + For differentially accessible peaks identification include only those peaks that + on average have log fold change difference in the chromatin accessibility between + every tested pair of clusters not lower than this value. Ignored if '--diffpeaks' + is not set. + Default: 0.25 + 'sd:layout': + advanced: true + + minimum_pct: + type: float?
+ default: 0.05 + label: "Include only those peaks that are detected in not lower than this fraction of cells in either of the two tested clusters" + doc: | + For differentially accessible peaks identification include only those peaks that + are detected in not lower than this fraction of cells in either of the two tested + clusters. Ignored if '--diffpeaks' is not set. + Default: 0.05 + 'sd:layout': + advanced: true + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + default: "classic" + label: "Color theme for all generated plots" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + 'sd:layout': + advanced: true + + parallel_memory_limit: + type: + - "null" + - type: enum + symbols: + - "32" + default: "32" + label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Forced to 32 GB + 'sd:layout': + advanced: true + + vector_memory_limit: + type: + - "null" + - type: enum + symbols: + - "64" + default: "64" + label: "Maximum vector memory in GB allowed to be used by R" + doc: | + Maximum vector memory in GB allowed to be used by R. + Forced to 64 GB + 'sd:layout': + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + default: "1" + label: "Number of cores/cpus to use" + doc: | + Number of cores/cpus to use + Forced to 1 + 'sd:layout': + advanced: true + + +outputs: + + umap_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_atac_cluster/umap_res_plot_png + label: "Clustered cells UMAP" + doc: | + Clustered cells UMAP. 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Clustered cells UMAP' + + slh_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_atac_cluster/slh_res_plot_png + label: "Silhouette scores. Downsampled to max 500 cells per cluster." + doc: | + Silhouette scores. Downsampled to max 500 cells per cluster. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Silhouette scores. Downsampled to max 500 cells per cluster.' + + umap_spl_idnt_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_atac_cluster/umap_spl_idnt_res_plot_png + label: "Split by dataset clustered cells UMAP" + doc: | + Split by dataset clustered cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by dataset clustered cells UMAP' + + cmp_gr_clst_spl_idnt_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_atac_cluster/cmp_gr_clst_spl_idnt_res_plot_png + label: "Grouped by cluster split by dataset cells composition plot. Downsampled." + doc: | + Grouped by cluster split by dataset cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Grouped by cluster split by dataset cells composition plot. Downsampled.' + + cmp_gr_idnt_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_atac_cluster/cmp_gr_idnt_spl_clst_res_plot_png + label: "Grouped by dataset split by cluster cells composition plot. Downsampled." + doc: | + Grouped by dataset split by cluster cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Grouped by dataset split by cluster cells composition plot. Downsampled.' 
+ + umap_spl_cnd_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_atac_cluster/umap_spl_cnd_res_plot_png + label: "Split by grouping condition clustered cells UMAP" + doc: | + Split by grouping condition clustered cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Split by grouping condition clustered cells UMAP' + + cmp_gr_clst_spl_cnd_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_atac_cluster/cmp_gr_clst_spl_cnd_res_plot_png + label: "Grouped by cluster split by condition cells composition plot. Downsampled." + doc: | + Grouped by cluster split by condition cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Grouped by cluster split by condition cells composition plot. Downsampled.' + + cmp_gr_cnd_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_atac_cluster/cmp_gr_cnd_spl_clst_res_plot_png + label: "Grouped by condition split by cluster cells composition plot. Downsampled." + doc: | + Grouped by condition split by cluster cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Grouped by condition split by cluster cells composition plot. Downsampled.' + + cvrg_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_atac_cluster/cvrg_res_plot_png + label: "Tn5 insertion frequency plot around gene" + doc: | + Tn5 insertion frequency plot around gene. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Genome coverage' + Caption: 'Tn5 insertion frequency plot around gene' + + peak_markers_tsv: + type: File? + outputSource: sc_atac_cluster/peak_markers_tsv + label: "Differentially accessible peaks between each pair of clusters" + doc: | + Differentially accessible peaks between each pair of clusters for all resolutions. 
+ TSV format + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'Diff. peaks' + Title: 'Differentially accessible peaks between each pair of clusters' + + ucsc_cb_config_data: + type: File + outputSource: compress_cellbrowser_config_data/compressed_folder + label: "Compressed directory with UCSC Cellbrowser configuration data" + doc: | + Compressed directory with UCSC Cellbrowser configuration data. + + ucsc_cb_html_data: + type: Directory + outputSource: sc_atac_cluster/ucsc_cb_html_data + label: "Directory with UCSC Cellbrowser html data" + doc: | + Directory with UCSC Cellbrowser html data. + + ucsc_cb_html_file: + type: File + outputSource: sc_atac_cluster/ucsc_cb_html_file + label: "Open in UCSC Cell Browser" + doc: | + HTML index file from the directory with UCSC Cellbrowser html data. + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + seurat_data_rds: + type: File + outputSource: sc_atac_cluster/seurat_data_rds + label: "Processed Seurat data in RDS format" + doc: | + Processed Seurat data in RDS format + + sc_atac_cluster_stdout_log: + type: File + outputSource: sc_atac_cluster/stdout_log + label: "stdout log generated by sc_atac_cluster step" + doc: | + stdout log generated by sc_atac_cluster step + + sc_atac_cluster_stderr_log: + type: File + outputSource: sc_atac_cluster/stderr_log + label: "stderr log generated by sc_atac_cluster step" + doc: | + stderr log generated by sc_atac_cluster step + + +steps: + + sc_atac_cluster: + doc: | + Clusters single-cell ATAC-Seq datasets, identifies differentially + accessible peaks + run: ../tools/sc-atac-cluster.cwl + in: + query_data_rds: query_data_rds + dimensions: dimensions + cluster_metric: + default: euclidean + cluster_algorithm: cluster_algorithm + resolution: resolution + atac_fragments_file: atac_fragments_file + genes_of_interest: + source: genes_of_interest + valueFrom: $(split_features(self)) + identify_diff_peaks: identify_diff_peaks + minimum_logfc: minimum_logfc + minimum_pct: 
minimum_pct + test_to_use: + default: LR + verbose: + default: true + export_ucsc_cb: + default: true + color_theme: color_theme + parallel_memory_limit: + source: parallel_memory_limit + valueFrom: $(parseInt(self)) + vector_memory_limit: + source: vector_memory_limit + valueFrom: $(parseInt(self)) + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - umap_res_plot_png + - slh_res_plot_png + - umap_spl_idnt_res_plot_png + - cmp_gr_clst_spl_idnt_res_plot_png + - cmp_gr_idnt_spl_clst_res_plot_png + - umap_spl_cnd_res_plot_png + - cmp_gr_clst_spl_cnd_res_plot_png + - cmp_gr_cnd_spl_clst_res_plot_png + - cvrg_res_plot_png + - peak_markers_tsv + - ucsc_cb_config_data + - ucsc_cb_html_data + - ucsc_cb_html_file + - seurat_data_rds + - stdout_log + - stderr_log + + compress_cellbrowser_config_data: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: sc_atac_cluster/ucsc_cb_config_data + out: + - compressed_folder + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-cell ATAC-Seq Cluster Analysis" +s:name: "Single-cell ATAC-Seq Cluster Analysis" +s:alternateName: "Clusters single-cell ATAC-Seq datasets, identifies differentially accessible peaks" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-atac-cluster.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: 
"https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell ATAC-Seq Cluster Analysis + + Clusters single-cell ATAC-Seq datasets, identifies differentially + accessible peaks. \ No newline at end of file diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl new file mode 100644 index 00000000..eec05b14 --- /dev/null +++ b/workflows/sc-atac-reduce.cwl @@ -0,0 +1,466 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var split_features = function(line) { + function get_unique(value, index, self) { + return self.indexOf(value) === index && value != ""; + } + let splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + +'sd:upstream': + sc_tools_sample: + - "sc-rna-cluster.cwl" + - "sc-rna-reduce.cwl" + - "sc-multiome-filter.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name/alias" + sd:preview: + position: 1 + + query_data_rds: + type: File + label: "Experiment run through Single-cell Multiome ATAC and RNA-Seq Filtering Analysis" + doc: | + Path to the RDS file to load Seurat object from. This file should include + chromatin accessibility information stored in the ATAC assay. + 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" + 'sd:localLabel': true + + datasets_metadata: + type: File? 
+ label: "Path to the TSV/CSV file to optionally extend Seurat object metadata with categorical values" + doc: | + Path to the TSV/CSV file to optionally extend Seurat + object metadata with categorical values using samples + identities. First column - 'library_id' should + correspond to all unique values from the 'new.ident' + column of the loaded Seurat object. If any of the + provided in this file columns are already present in + the Seurat object metadata, they will be overwritten. + When combined with --barcodes parameter, first the + metadata will be extended, then barcode filtering will + be applied. + Default: no extra metadata is added + + barcodes_data: + type: File? + label: "Optional TSV/CSV file to prefilter and extend metadata by barcodes. First column should be named as 'barcode'" + doc: | + Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata overwriting the existing ones if + those are present. + Default: all cells used, no extra metadata is added + + dimensions: + type: int? + label: "Dimensionality to use for datasets integration and UMAP projection (from 2 to 50)" + default: 40 + doc: | + Dimensionality to use for datasets integration and + UMAP projection (from 2 to 50). If single value N is + provided, use from 2 to N LSI components. If multiple + values are provided, subset to only selected LSI + components. In combination with --ntgr set to harmony, + selected principal components will be used in Harmony + integration. + Default: from 2 to 40 + + normalization_method: + type: + - "null" + - type: enum + symbols: + - "log-tfidf" + - "tf-logidf" + - "logtf-logidf" + - "idf" + label: "TF-IDF normalization method applied to chromatin accessibility counts" + default: "log-tfidf" + doc: | + TF-IDF normalization method applied to chromatin + accessibility counts.
log-tfidf - Stuart & Butler et + al. 2019, tf-logidf - Cusanovich & Hill et al. 2018, + logtf-logidf - Andrew Hill, idf - 10x Genomics. + Default: log-tfidf + 'sd:layout': + advanced: true + + integration_method: + type: + - "null" + - type: enum + symbols: + - "signac" + - "harmony" + - "none" + label: "Integration method used for joint analysis of multiple datasets" + default: "signac" + doc: | + Integration method used for joint analysis of multiple + datasets. Automatically set to 'none' if loaded Seurat + object includes only one dataset. Default: signac + 'sd:layout': + advanced: true + + integrate_by: + type: string? + label: "Variable(s) to be integrated out when running multiple integration with Harmony" + default: "new.ident" + doc: | + Column(s) from the Seurat object metadata to define + the variable(s) that should be integrated out when + running multiple datasets integration with harmony. + May include columns from the extra metadata added with + --metadata parameter. Ignored if --ntgr is not set to + harmony. + Default: new.ident + 'sd:layout': + advanced: true + + minimum_var_peaks_perc: + type: int? + label: "Minimum percentile for identifying the top most common peaks as highly variable" + default: 0 + doc: | + Minimum percentile for identifying the top most common peaks as highly variable. + For example, setting to 5 will use the top 95 percent most common among all cells + peaks as highly variable. These peaks are used for datasets integration, scaling + and dimensionality reduction. + Default: 0 (use all available peaks) + 'sd:layout': + advanced: true + + umap_spread: + type: float? + label: "UMAP Spread - the effective scale of embedded points (determines how clustered/clumped the embedded points are)" + default: 1 + doc: | + The effective scale of embedded points on UMAP. In combination with '--mindist' + it determines how clustered/clumped the embedded points are.
+ Default: 1 + 'sd:layout': + advanced: true + + umap_mindist: + type: float? + label: "UMAP Min. Dist. - controls how tightly the embedding is allowed to compress points together" + default: 0.3 + doc: | + Controls how tightly the embedding is allowed to compress points together on UMAP. + Larger values ensure embedded points are more evenly distributed, while smaller + values allow the algorithm to optimise more accurately with regard to local structure. + Sensible values are in the range 0.001 to 0.5. + Default: 0.3 + 'sd:layout': + advanced: true + + umap_neighbors: + type: int? + label: "UMAP Neighbors Number - determines the number of neighboring points used" + default: 30 + doc: | + Determines the number of neighboring points used in UMAP. Larger values will result + in more global structure being preserved at the loss of detailed local structure. + In general this parameter should often be in the range 5 to 50. + Default: 30 + 'sd:layout': + advanced: true + + umap_metric: + type: + - "null" + - type: enum + symbols: + - "euclidean" + - "cosine" + - "correlation" + label: "UMAP Dist. Metric - the metric to use to compute distances in high dimensional space" + default: "cosine" + doc: | + The metric to use to compute distances in high dimensional space for UMAP. + Default: cosine + 'sd:layout': + advanced: true + + umap_method: + type: + - "null" + - type: enum + symbols: + - "uwot" + - "uwot-learn" + - "umap-learn" + label: "UMAP implementation to run (if set to 'umap-learn' use 'correlation' distance metric)" + default: "uwot" + doc: | + UMAP implementation to run. If set to 'umap-learn' use --umetric 'correlation' + Default: uwot + 'sd:layout': + advanced: true + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + default: "classic" + label: "Color theme for all generated plots" + doc: | + Color theme for all generated plots.
One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + 'sd:layout': + advanced: true + + parallel_memory_limit: + type: + - "null" + - type: enum + symbols: + - "32" + default: "32" + label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Forced to 32 GB + 'sd:layout': + advanced: true + + vector_memory_limit: + type: + - "null" + - type: enum + symbols: + - "96" + default: "96" + label: "Maximum vector memory in GB allowed to be used by R" + doc: | + Maximum vector memory in GB allowed to be used by R. + Forced to 96 GB + 'sd:layout': + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + - "2" + default: "2" + label: "Number of cores/cpus to use" + doc: | + Number of cores/cpus to use + Forced to 2 + 'sd:layout': + advanced: true + + +outputs: + + qc_dim_corr_plot_png: + type: File? + outputSource: sc_atac_reduce/qc_dim_corr_plot_png + label: "Correlation plots between QC metrics and cells LSI dimensions" + doc: | + Correlation plots between QC metrics and cells LSI dimensions. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Correlation plots between QC metrics and cells LSI dimensions' + + umap_qc_mtrcs_plot_png: + type: File? + outputSource: sc_atac_reduce/umap_qc_mtrcs_plot_png + label: "QC metrics on cells UMAP" + doc: | + QC metrics on cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'QC metrics on cells UMAP' + + umap_plot_png: + type: File? + outputSource: sc_atac_reduce/umap_plot_png + label: "Cells UMAP" + doc: | + Cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Cells UMAP' + + umap_spl_idnt_plot_png: + type: File? + outputSource: sc_atac_reduce/umap_spl_idnt_plot_png + label: "Split by dataset cells UMAP" + doc: | + Split by dataset cells UMAP. 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by dataset cells UMAP' + + umap_spl_cnd_plot_png: + type: File? + outputSource: sc_atac_reduce/umap_spl_cnd_plot_png + label: "Split by grouping condition cells UMAP" + doc: | + Split by grouping condition cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Split by grouping condition cells UMAP' + + seurat_data_rds: + type: File + outputSource: sc_atac_reduce/seurat_data_rds + label: "Processed Seurat data in RDS format" + doc: | + Processed Seurat data in RDS format + + sc_atac_reduce_stdout_log: + type: File + outputSource: sc_atac_reduce/stdout_log + label: "stdout log generated by sc_atac_reduce step" + doc: | + stdout log generated by sc_atac_reduce step + + sc_atac_reduce_stderr_log: + type: File + outputSource: sc_atac_reduce/stderr_log + label: "stderr log generated by sc_atac_reduce step" + doc: | + stderr log generated by sc_atac_reduce step + + +steps: + + sc_atac_reduce: + doc: | + Integrates multiple single-cell ATAC-Seq datasets, + reduces dimensionality using LSI + run: ../tools/sc-atac-reduce.cwl + in: + query_data_rds: query_data_rds + barcodes_data: barcodes_data + datasets_metadata: datasets_metadata + normalization_method: normalization_method + integration_method: integration_method + integrate_by: + source: integrate_by + valueFrom: $(split_features(self)) + minimum_var_peaks_perc: minimum_var_peaks_perc + dimensions: dimensions + umap_spread: umap_spread + umap_mindist: umap_mindist + umap_neighbors: umap_neighbors + umap_metric: umap_metric + umap_method: umap_method + verbose: + default: true + export_ucsc_cb: + default: false + color_theme: color_theme + parallel_memory_limit: + source: parallel_memory_limit + valueFrom: $(parseInt(self)) + vector_memory_limit: + source: vector_memory_limit + valueFrom: $(parseInt(self)) + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - qc_dim_corr_plot_png + - 
umap_qc_mtrcs_plot_png + - umap_plot_png + - umap_spl_idnt_plot_png + - umap_spl_cnd_plot_png + - seurat_data_rds + - stdout_log + - stderr_log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-cell ATAC-Seq Dimensionality Reduction Analysis" +s:name: "Single-cell ATAC-Seq Dimensionality Reduction Analysis" +s:alternateName: "Integrates multiple single-cell ATAC-Seq datasets, reduces dimensionality using LSI" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-atac-reduce.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell ATAC-Seq Dimensionality Reduction Analysis + + Integrates multiple single-cell ATAC-Seq datasets, + reduces dimensionality using LSI. 
\ No newline at end of file diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl new file mode 100644 index 00000000..cc45dee6 --- /dev/null +++ b/workflows/sc-ctype-assign.cwl @@ -0,0 +1,823 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var split_features = function(line) { + function get_unique(value, index, self) { + return self.indexOf(value) === index && value != ""; + } + var splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + - var get_query_column = function(prefix, reduction, resolution) { + if (reduction=="RNA") { + return prefix + "rna_res." + resolution; + } else if (reduction=="ATAC") { + return prefix + "atac_res." + resolution; + } else if (reduction=="WNN") { + return prefix + "wsnn_res." + resolution; + } + }; + + +'sd:upstream': + sc_tools_sample: + - "sc-ctype-assign.cwl" + - "sc-rna-cluster.cwl" + - "sc-atac-cluster.cwl" + - "sc-wnn-cluster.cwl" + sc_arc_sample: + - "cellranger-arc-count.cwl" + - "cellranger-arc-aggr.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name/alias" + sd:preview: + position: 1 + + query_data_rds: + type: File + label: "Experiment run through any of the Single-cell Cluster Analysis" + doc: | + Path to the RDS file to load Seurat object from. This file should include + genes expression and/or chromatin accessibility information stored in the RNA + and ATAC assays correspondingly. Additionally, 'rnaumap', and/or 'atacumap', + and/or 'wnnumap' dimensionality reductions should be present. 
+ 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" + 'sd:localLabel': true + + query_reduction: + type: + - "null" + - type: enum + symbols: + - "RNA" + - "ATAC" + - "WNN" + default: "RNA" + label: "Select clusters based on" + doc: | + If set to 'RNA', then 'get_query_column' will have suffix 'rna_res'. + If set to 'ATAC', then 'get_query_column' will have suffix 'atac_res'. + If set to 'WNN', then 'get_query_column' will have suffix 'wsnn_res'. + + query_resolution: + type: float + label: "Clustering resolution to assign cell types to" + doc: | + Clustering resolution defines 'query_source_column' and 'query_target_column' + inputs for 'assign_cell_types' step + + atac_fragments_file: + type: File? + secondaryFiles: + - .tbi + label: "Cell Ranger ARC Count/Aggregate Experiment for ATAC or WNN clusters" + doc: | + Count and barcode information for every ATAC fragment used in the loaded Seurat + object. File should be saved in TSV format with tbi-index file. Ignored if the + loaded Seurat object doesn't include ATAC assay. + 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" + 'sd:localLabel': true + + genes_of_interest: + type: string? + default: null + label: "Genes of interest to build gene expression and/or Tn5 insertion frequency plots for the nearest peaks" + doc: | + Genes of interest to build gene expression and/or Tn5 insertion frequency plots + for the nearest peaks. To build gene expression plots the loaded Seurat object + should include RNA assay. To build Tn5 insertion frequency plots for the nearest + peaks the loaded Seurat object should include ATAC assay as well as the --fragments + file should be provided. + Default: None + + cell_type_data: + type: File + label: "TSV/CSV cell types metadata file with 'cluster' and 'type' columns" + doc: | + Path to the TSV/CSV file for manual cell type assignment for each of the clusters. + First column - 'cluster', second column may have arbitrary name. + + identify_diff_genes: + type: boolean? 
+ default: false + label: "Identify differentially expressed genes for assigned cell types" + doc: | + Identify differentially expressed genes (putative gene markers) for + assigned cell types. Ignored if loaded Seurat object doesn't include + genes expression information stored in the RNA assay. + Default: false + 'sd:layout': + advanced: true + + identify_diff_peaks: + type: boolean? + default: false + label: "Identify differentially accessible peaks for assigned cell types" + doc: | + Identify differentially accessible peaks for assigned cell types. Ignored + if loaded Seurat object doesn't include chromatin accessibility information + stored in the ATAC assay. + Default: false + 'sd:layout': + advanced: true + + rna_minimum_logfc: + type: float? + default: 0.25 + label: "Include only those genes that on average have log fold change difference in expression between every tested pair of cell types not lower than this value" + doc: | + For putative gene markers identification include only those genes that + on average have log fold change difference in expression between every + tested pair of cell types not lower than this value. Ignored if '--diffgenes' + is not set or RNA assay is not present. + Default: 0.25 + 'sd:layout': + advanced: true + + rna_minimum_pct: + type: float? + default: 0.1 + label: "Include only those genes that are detected in not lower than this fraction of cells in either of the two tested cell types" + doc: | + For putative gene markers identification include only those genes that + are detected in not lower than this fraction of cells in either of the + two tested cell types. Ignored if '--diffgenes' is not set or RNA assay + is not present. + Default: 0.1 + 'sd:layout': + advanced: true + + atac_minimum_logfc: + type: float? 
+ default: 0.25 + label: "Include only those peaks that on average have log fold change difference in the chromatin accessibility between every tested pair of cell types not lower than this value" + doc: | + For differentially accessible peaks identification include only those peaks that + on average have log fold change difference in the chromatin accessibility between + every tested pair of cell types not lower than this value. Ignored if '--diffpeaks' + is not set or ATAC assay is not present. + Default: 0.25 + 'sd:layout': + advanced: true + + atac_minimum_pct: + type: float? + default: 0.05 + label: "Include only those peaks that are detected in not lower than this fraction of cells in either of the two tested cell types" + doc: | + For differentially accessible peaks identification include only those peaks that + are detected in not lower than this fraction of cells in either of the two tested + cell types. Ignored if '--diffpeaks' is not set or ATAC assay is not present. + Default: 0.05 + 'sd:layout': + advanced: true + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + default: "classic" + label: "Color theme for all generated plots" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + 'sd:layout': + advanced: true + + parallel_memory_limit: + type: + - "null" + - type: enum + symbols: + - "32" + default: "32" + label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. 
+ Forced to 32 GB + 'sd:layout': + advanced: true + + vector_memory_limit: + type: + - "null" + - type: enum + symbols: + - "64" + default: "64" + label: "Maximum vector memory in GB allowed to be used by R" + doc: | + Maximum vector memory in GB allowed to be used by R. + Forced to 64 GB + 'sd:layout': + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + default: "1" + label: "Number of cores/cpus to use" + doc: | + Number of cores/cpus to use + Forced to 1 + 'sd:layout': + advanced: true + + +outputs: + + umap_rd_rnaumap_plot_png: + type: File? + outputSource: ctype_assign/umap_rd_rnaumap_plot_png + label: "Clustered cells RNA UMAP with assigned cell types" + doc: | + Cells UMAP with assigned cell types (rnaumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Clustered cells RNA UMAP with assigned cell types' + + umap_rd_atacumap_plot_png: + type: File? + outputSource: ctype_assign/umap_rd_atacumap_plot_png + label: "Clustered cells ATAC UMAP with assigned cell types" + doc: | + Cells UMAP with assigned cell types (atacumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Clustered cells ATAC UMAP with assigned cell types' + + umap_rd_wnnumap_plot_png: + type: File? + outputSource: ctype_assign/umap_rd_wnnumap_plot_png + label: "Clustered cells WNN UMAP with assigned cell types" + doc: | + Cells UMAP with assigned cell types (wnnumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Clustered cells WNN UMAP with assigned cell types' + + umap_spl_idnt_rd_rnaumap_plot_png: + type: File? + outputSource: ctype_assign/umap_spl_idnt_rd_rnaumap_plot_png + label: "Split by dataset clustered cells RNA UMAP with assigned cell types" + doc: | + Split by dataset cells UMAP with assigned cell types (rnaumap dim. reduction). 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by dataset clustered cells RNA UMAP with assigned cell types' + + umap_spl_idnt_rd_atacumap_plot_png: + type: File? + outputSource: ctype_assign/umap_spl_idnt_rd_atacumap_plot_png + label: "Split by dataset clustered cells ATAC UMAP with assigned cell types" + doc: | + Split by dataset cells UMAP with assigned cell types (atacumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by dataset clustered cells ATAC UMAP with assigned cell types' + + umap_spl_idnt_rd_wnnumap_plot_png: + type: File? + outputSource: ctype_assign/umap_spl_idnt_rd_wnnumap_plot_png + label: "Split by dataset clustered cells WNN UMAP with assigned cell types" + doc: | + Split by dataset cells UMAP with assigned cell types (wnnumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by dataset clustered cells WNN UMAP with assigned cell types' + + umap_spl_cnd_rd_rnaumap_plot_png: + type: File? + outputSource: ctype_assign/umap_spl_cnd_rd_rnaumap_plot_png + label: "Split by grouping condition clustered cells RNA UMAP with assigned cell types" + doc: | + Split by grouping condition cells UMAP with assigned cell types (rnaumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Split by grouping condition clustered cells RNA UMAP with assigned cell types' + + umap_spl_cnd_rd_atacumap_plot_png: + type: File? + outputSource: ctype_assign/umap_spl_cnd_rd_atacumap_plot_png + label: "Split by grouping condition clustered cells ATAC UMAP with assigned cell types" + doc: | + Split by grouping condition cells UMAP with assigned cell types (atacumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Split by grouping condition clustered cells ATAC UMAP with assigned cell types' + + umap_spl_cnd_rd_wnnumap_plot_png: + type: File? 
+ outputSource: ctype_assign/umap_spl_cnd_rd_wnnumap_plot_png + label: "Split by grouping condition clustered cells WNN UMAP with assigned cell types" + doc: | + Split by grouping condition cells UMAP with assigned cell types (wnnumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Split by grouping condition clustered cells WNN UMAP with assigned cell types' + + umap_spl_ph_rd_rnaumap_plot_png: + type: File? + outputSource: ctype_assign/umap_spl_ph_rd_rnaumap_plot_png + label: "Split by cell cycle phase cells RNA UMAP with assigned cell types" + doc: | + Split by cell cycle phase cells UMAP with assigned cell types (rnaumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by cell cycle phase cells RNA UMAP with assigned cell types' + + umap_spl_ph_rd_atacumap_plot_png: + type: File? + outputSource: ctype_assign/umap_spl_ph_rd_atacumap_plot_png + label: "Split by cell cycle phase cells ATAC UMAP with assigned cell types" + doc: | + Split by cell cycle phase cells UMAP with assigned cell types (atacumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by cell cycle phase cells ATAC UMAP with assigned cell types' + + umap_spl_ph_rd_wnnumap_plot_png: + type: File? + outputSource: ctype_assign/umap_spl_ph_rd_wnnumap_plot_png + label: "Split by cell cycle phase cells WNN UMAP with assigned cell types" + doc: | + Split by cell cycle phase cells UMAP with assigned cell types (wnnumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by cell cycle phase cells WNN UMAP with assigned cell types' + + cmp_gr_ctyp_spl_idnt_plot_png: + type: File? + outputSource: ctype_assign/cmp_gr_ctyp_spl_idnt_plot_png + label: "Grouped by cell type split by dataset cells composition plot. Downsampled." + doc: | + Grouped by cell type split by dataset cells composition plot. Downsampled. 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Grouped by cell type split by dataset cells composition plot. Downsampled.' + + cmp_gr_idnt_spl_ctyp_plot_png: + type: File? + outputSource: ctype_assign/cmp_gr_idnt_spl_ctyp_plot_png + label: "Grouped by dataset split by cell type cells composition plot. Downsampled." + doc: | + Grouped by dataset split by cell type cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Grouped by dataset split by cell type cells composition plot. Downsampled.' + + cmp_gr_ph_spl_idnt_plot_png: + type: File? + outputSource: ctype_assign/cmp_gr_ph_spl_idnt_plot_png + label: "Grouped by cell cycle phase split by dataset cells composition plot. Downsampled." + doc: | + Grouped by cell cycle phase split by dataset cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Grouped by cell cycle phase split by dataset cells composition plot. Downsampled.' + + cmp_gr_ctyp_spl_cnd_plot_png: + type: File? + outputSource: ctype_assign/cmp_gr_ctyp_spl_cnd_plot_png + label: "Grouped by cell type split by condition cells composition plot. Downsampled." + doc: | + Grouped by cell type split by condition cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Grouped by cell type split by condition cells composition plot. Downsampled.' + + cmp_gr_cnd_spl_ctyp_plot_png: + type: File? + outputSource: ctype_assign/cmp_gr_cnd_spl_ctyp_plot_png + label: "Grouped by condition split by cell type cells composition plot. Downsampled." + doc: | + Grouped by condition split by cell type cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Grouped by condition split by cell type cells composition plot. Downsampled.' + + cmp_gr_ph_spl_ctyp_plot_png: + type: File? 
+ outputSource: ctype_assign/cmp_gr_ph_spl_ctyp_plot_png + label: "Grouped by cell cycle phase split by cell type cells composition plot. Downsampled." + doc: | + Grouped by cell cycle phase split by cell type cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Grouped by cell cycle phase split by cell type cells composition plot. Downsampled.' + + xpr_avg_plot_png: + type: File? + outputSource: ctype_assign/xpr_avg_plot_png + label: "Log normalized scaled average gene expression per cell type" + doc: | + Log normalized scaled average gene expression per cell type. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized scaled average gene expression per cell type' + + xpr_dnst_plot_png: + type: + - "null" + - type: array + items: File + outputSource: ctype_assign/xpr_dnst_plot_png + label: "Log normalized gene expression density per cell type" + doc: | + Log normalized gene expression density per cell type. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized gene expression density per cell type' + + xpr_per_cell_rd_rnaumap_plot_png: + type: + - "null" + - type: array + items: File + outputSource: ctype_assign/xpr_per_cell_rd_rnaumap_plot_png + label: "Log normalized gene expression on cells RNA UMAP with assigned cell types" + doc: | + Log normalized gene expression on cells UMAP with assigned cell types (rnaumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized gene expression on cells RNA UMAP with assigned cell types' + + xpr_per_cell_rd_atacumap_plot_png: + type: + - "null" + - type: array + items: File + outputSource: ctype_assign/xpr_per_cell_rd_atacumap_plot_png + label: "Log normalized gene expression on cells ATAC UMAP with assigned cell types" + doc: | + Log normalized gene expression on cells UMAP with assigned cell types (atacumap dim. 
reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized gene expression on cells ATAC UMAP with assigned cell types' + + xpr_per_cell_rd_wnnumap_plot_png: + type: + - "null" + - type: array + items: File + outputSource: ctype_assign/xpr_per_cell_rd_wnnumap_plot_png + label: "Log normalized gene expression on cells WNN UMAP with assigned cell types" + doc: | + Log normalized gene expression on cells UMAP with assigned cell types (wnnumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized gene expression on cells WNN UMAP with assigned cell types' + + xpr_per_cell_sgnl_rd_rnaumap_plot_png: + type: + - "null" + - type: array + items: File + outputSource: ctype_assign/xpr_per_cell_sgnl_rd_rnaumap_plot_png + label: "Log normalized gene expression density on cells RNA UMAP with assigned cell types" + doc: | + Log normalized gene expression density on cells UMAP with assigned cell types (rnaumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized gene expression density on cells RNA UMAP with assigned cell types' + + xpr_per_cell_sgnl_rd_atacumap_plot_png: + type: + - "null" + - type: array + items: File + outputSource: ctype_assign/xpr_per_cell_sgnl_rd_atacumap_plot_png + label: "Log normalized gene expression density on cells ATAC UMAP with assigned cell types" + doc: | + Log normalized gene expression density on cells UMAP with assigned cell types (atacumap dim. reduction). 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized gene expression density on cells ATAC UMAP with assigned cell types' + + xpr_per_cell_sgnl_rd_wnnumap_plot_png: + type: + - "null" + - type: array + items: File + outputSource: ctype_assign/xpr_per_cell_sgnl_rd_wnnumap_plot_png + label: "Log normalized gene expression density on cells WNN UMAP with assigned cell types" + doc: | + Log normalized gene expression density on cells UMAP with assigned cell types (wnnumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized gene expression density on cells WNN UMAP with assigned cell types' + + cvrg_plot_png: + type: + - "null" + - type: array + items: File + outputSource: ctype_assign/cvrg_plot_png + label: "Tn5 insertion frequency plot around gene" + doc: | + Tn5 insertion frequency plot around gene. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Genome coverage' + Caption: 'Tn5 insertion frequency plot around gene' + + xpr_htmp_plot_png: + type: File? + outputSource: ctype_assign/xpr_htmp_plot_png + label: "Normalized gene expression heatmap grouped by cell type" + doc: | + Normalized gene expression heatmap grouped by cell type. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Normalized gene expression heatmap grouped by cell type' + + gene_markers_tsv: + type: File? + outputSource: ctype_assign/gene_markers_tsv + label: "Differentially expressed genes between each pair of cell types" + doc: | + Differentially expressed genes between each pair of cell types. + TSV format + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'Gene markers' + Title: 'Differentially expressed genes between each pair of cell types' + + peak_markers_tsv: + type: File? 
+ outputSource: ctype_assign/peak_markers_tsv + label: "Differentially accessible peaks between each pair of cell types" + doc: | + Differentially accessible peaks between each pair of cell types. + TSV format + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'Diff. peaks' + Title: 'Differentially accessible peaks between each pair of cell types' + + ucsc_cb_config_data: + type: File + outputSource: compress_cellbrowser_config_data/compressed_folder + label: "Compressed directory with UCSC Cellbrowser configuration data" + doc: | + Compressed directory with UCSC Cellbrowser configuration data. + + ucsc_cb_html_data: + type: Directory + outputSource: ctype_assign/ucsc_cb_html_data + label: "Directory with UCSC Cellbrowser html data" + doc: | + Directory with UCSC Cellbrowser html data. + + ucsc_cb_html_file: + type: File + outputSource: ctype_assign/ucsc_cb_html_file + label: "Open in UCSC Cell Browser" + doc: | + HTML index file from the directory with UCSC Cellbrowser html data. + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + seurat_data_rds: + type: File + outputSource: ctype_assign/seurat_data_rds + label: "Processed Seurat data in RDS format" + doc: | + Processed Seurat data in RDS format + + ctype_assign_stdout_log: + type: File + outputSource: ctype_assign/stdout_log + label: "stdout log generated by ctype_assign step" + doc: | + stdout log generated by ctype_assign step + + ctype_assign_stderr_log: + type: File + outputSource: ctype_assign/stderr_log + label: "stderr log generated by ctype_assign step" + doc: | + stderr log generated by ctype_assign step + + +steps: + + ctype_assign: + run: ../tools/sc-ctype-assign.cwl + in: + query_data_rds: query_data_rds + cell_type_data: cell_type_data + query_source_column: + source: [query_reduction, query_resolution] + valueFrom: $(get_query_column("", self[0], self[1])) + query_target_column: + source: [query_reduction, query_resolution] + valueFrom: $(get_query_column("custom_", self[0], 
self[1])) + atac_fragments_file: atac_fragments_file + genes_of_interest: + source: genes_of_interest + valueFrom: $(split_features(self)) + identify_diff_genes: identify_diff_genes + identify_diff_peaks: identify_diff_peaks + rna_minimum_logfc: rna_minimum_logfc + rna_minimum_pct: rna_minimum_pct + atac_minimum_logfc: atac_minimum_logfc + atac_minimum_pct: atac_minimum_pct + only_positive_diff_genes: + default: true + rna_test_to_use: + default: wilcox + atac_test_to_use: + default: LR + verbose: + default: true + export_ucsc_cb: + default: true + color_theme: color_theme + parallel_memory_limit: + source: parallel_memory_limit + valueFrom: $(parseInt(self)) + vector_memory_limit: + source: vector_memory_limit + valueFrom: $(parseInt(self)) + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - umap_rd_rnaumap_plot_png + - umap_rd_atacumap_plot_png + - umap_rd_wnnumap_plot_png + - umap_spl_idnt_rd_rnaumap_plot_png + - umap_spl_idnt_rd_atacumap_plot_png + - umap_spl_idnt_rd_wnnumap_plot_png + - umap_spl_cnd_rd_rnaumap_plot_png + - umap_spl_cnd_rd_atacumap_plot_png + - umap_spl_cnd_rd_wnnumap_plot_png + - umap_spl_ph_rd_rnaumap_plot_png + - umap_spl_ph_rd_atacumap_plot_png + - umap_spl_ph_rd_wnnumap_plot_png + - cmp_gr_ctyp_spl_idnt_plot_png + - cmp_gr_idnt_spl_ctyp_plot_png + - cmp_gr_ph_spl_idnt_plot_png + - cmp_gr_ctyp_spl_cnd_plot_png + - cmp_gr_cnd_spl_ctyp_plot_png + - cmp_gr_ph_spl_ctyp_plot_png + - xpr_avg_plot_png + - xpr_dnst_plot_png + - xpr_per_cell_rd_rnaumap_plot_png + - xpr_per_cell_rd_atacumap_plot_png + - xpr_per_cell_rd_wnnumap_plot_png + - xpr_per_cell_sgnl_rd_rnaumap_plot_png + - xpr_per_cell_sgnl_rd_atacumap_plot_png + - xpr_per_cell_sgnl_rd_wnnumap_plot_png + - cvrg_plot_png + - xpr_htmp_plot_png + - gene_markers_tsv + - peak_markers_tsv + - ucsc_cb_config_data + - ucsc_cb_html_data + - ucsc_cb_html_file + - seurat_data_rds + - stdout_log + - stderr_log + + compress_cellbrowser_config_data: + run: ../tools/tar-compress.cwl + in: 
+ folder_to_compress: ctype_assign/ucsc_cb_config_data + out: + - compressed_folder + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-cell Manual Cell Type Assignment" +s:name: "Single-cell Manual Cell Type Assignment" +s:alternateName: "Assigns cell types for clusters based on the provided metadata file" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-ctype-assign.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell Manual Cell Type Assignment + + Assigns cell types for clusters based on the provided metadata file. 
\ No newline at end of file diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl new file mode 100644 index 00000000..8f1e5964 --- /dev/null +++ b/workflows/sc-multiome-filter.cwl @@ -0,0 +1,1405 @@ +cwlVersion: v1.1 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var split_numbers = function(line) { + let splitted_line = line?line.split(/[\s,]+/).map(parseFloat):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + + +'sd:upstream': + sc_arc_sample: + - "cellranger-arc-count.cwl" + - "cellranger-arc-aggr.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name/alias" + sd:preview: + position: 1 + + filtered_feature_bc_matrix_folder: + type: File + label: "Cell Ranger ARC Count/Aggregate Experiment" + doc: | + Path to the compressed folder with feature-barcode matrix from Cell Ranger ARC Count/Aggregate + experiment in MEX format. The rows consist of all the genes and peaks concatenated + together and the columns are restricted to those barcodes that are identified as cells. + 'sd:upstreamSource': "sc_arc_sample/filtered_feature_bc_matrix_folder" + 'sd:localLabel': true + + aggregation_metadata: + type: File? + label: "Cell Ranger ARC Count/Aggregate Experiment" + doc: | + Path to the metadata TSV/CSV file to set the datasets identities. If '--mex' points to + the Cell Ranger ARC Aggregate outputs, the aggr.csv file can be used. If input is not + provided, the default dummy_metadata.csv will be used instead. + 'sd:upstreamSource': "sc_arc_sample/aggregation_metadata" + + atac_fragments_file: + type: File + secondaryFiles: + - .tbi + label: "Cell Ranger ARC Count/Aggregate Experiment" + doc: | + Count and barcode information for every ATAC fragment observed in the experiment in TSV + format. Tbi-index file is required. 
+ 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" + + annotation_gtf_file: + type: File + label: "Cell Ranger ARC Count/Aggregate Experiment" + doc: | + Path to the genome annotation file in GTF format. + 'sd:upstreamSource': "sc_arc_sample/genome_indices/genome_indices/annotation_gtf" + 'sd:localLabel': true + + grouping_data: + type: File? + label: "Optional TSV/CSV file to define datasets grouping with 'library_id' and 'condition' columns. Rows order should correspond to the aggregation metadata." + doc: | + Path to the TSV/CSV file to define datasets grouping. + First column - 'library_id' with the values and order + that correspond to the 'library_id' column from the + '--identity' file, second column 'condition'. + Default: each dataset is assigned to its own group. + + blacklist_regions_file: + type: File? + label: "Optional BED file with the genomic blacklist regions" + doc: | + Path to the optional BED file with the genomic blacklist regions. + + barcodes_data: + type: File? + label: "Optional TSV/CSV file to prefilter and extend metadata by barcodes. First column should be named as 'barcode'" + doc: | + Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata overwriting the existing ones if + those are present. + Default: all cells used, no extra metadata is added + + minimum_genes: + type: string? + default: "250" + label: "Include cells where at least this many genes are detected" + doc: | + Include cells where at least this many genes are detected. If multiple values + provided, each of them will be applied to the correspondent dataset from the + '--mex' input based on the '--identity' file. + Default: 250 (applied to all datasets) + 'sd:layout': + advanced: true + + maximum_genes: + type: string?
+ default: "5000" + label: "Include cells with the number of genes not bigger than this value" + doc: | + Include cells with the number of genes not bigger than this value. If multiple + values provided, each of them will be applied to the correspondent dataset from + the '--mex' input based on the '--identity' file. + Default: 5000 (applied to all datasets) + 'sd:layout': + advanced: true + + rna_minimum_umi: + type: string? + default: "500" + label: "Include cells where at least this many UMI (RNA transcripts) are detected" + doc: | + Include cells where at least this many UMI (RNA transcripts) are detected. + If multiple values provided, each of them will be applied to the correspondent + dataset from the '--mex' input based on the '--identity' file. + Default: 500 (applied to all datasets) + 'sd:layout': + advanced: true + + mito_pattern: + type: string? + default: "^mt-|^MT-" + label: "Regex pattern to identify mitochondrial genes" + doc: | + Regex pattern to identify mitochondrial genes. + Default: '^mt-|^MT-' + 'sd:layout': + advanced: true + + maximum_mito_perc: + type: float? + default: 5 + label: "Include cells with the percentage of transcripts mapped to mitochondrial genes not bigger than this value" + doc: | + Include cells with the percentage of transcripts mapped to mitochondrial + genes not bigger than this value. + Default: 5 (applied to all datasets) + 'sd:layout': + advanced: true + + minimum_novelty_score: + type: string? + default: "0.8" + label: "Include cells with the novelty score not lower than this value, calculated as log10(genes)/log10(UMI) for RNA assay" + doc: | + Include cells with the novelty score not lower than this value, calculated + as log10(genes)/log10(UMI) for RNA assay. If multiple values provided, each of them will + be applied to the correspondent dataset from the '--mex' input based on the + '--identity' file. + Default: 0.8 (applied to all datasets) + 'sd:layout': + advanced: true + + atac_minimum_umi: + type: string? 
+ default: "1000" + label: "Include cells where at least this many UMI (ATAC transcripts) are detected" + doc: | + Include cells where at least this many UMI (ATAC transcripts) are detected. + If multiple values provided, each of them will be applied to the correspondent + dataset from the '--mex' input based on the '--identity' file. + Default: 1000 (applied to all datasets) + 'sd:layout': + advanced: true + + maximum_nucl_signal: + type: string? + default: "4" + label: "Include cells with the nucleosome signal not bigger than this value" + doc: | + Include cells with the nucleosome signal not bigger than this value. + Nucleosome signal quantifies the approximate ratio of mononucleosomal + to nucleosome-free fragments. If multiple values provided, each of + them will be applied to the correspondent dataset from the '--mex' input + based on the '--identity' file. + Default: 4 (applied to all datasets) + 'sd:layout': + advanced: true + + minimum_tss_enrich: + type: string? + default: "2" + label: "Include cells with the TSS enrichment score not lower than this value" + doc: | + Include cells with the TSS enrichment score not lower than this value. + Score is calculated based on the ratio of fragments centered at the TSS + to fragments in TSS-flanking regions. If multiple values provided, each + of them will be applied to the correspondent dataset from the '--mex' input + based on the '--identity' file. + Default: 2 (applied to all datasets) + 'sd:layout': + advanced: true + + minimum_frip: + type: string? + default: "0.15" + label: "Include cells with the FRiP not lower than this value" + doc: | + Include cells with the FRiP not lower than this value. If multiple values + provided, each of them will be applied to the correspondent dataset from the + '--mex' input based on the '--identity' file. FRiP is calculated for fragments. + Default: 0.15 (applied to all datasets) + 'sd:layout': + advanced: true + + maximum_blacklist_fraction: + type: string? 
+ default: "0.05" + label: "Include cells with the fraction of fragments in genomic blacklist regions not bigger than this value" + doc: | + Include cells with the fraction of fragments in + genomic blacklist regions not bigger than this value. + If multiple values provided, each of them will be + applied to the correspondent dataset from the '--mex' + input based on the '--identity' file. + Default: 0.05 (applied to all datasets) + 'sd:layout': + advanced: true + + call_by: + type: string? + default: null + label: "Replace Cell Ranger ARC peaks with MACS2 peaks called for cells grouped by selected column" + doc: | + Replace Cell Ranger ARC peaks with MACS2 peaks called + for cells grouped by the column from the optionally + provided --barcodes file. If --barcodes file was not + provided MACS2 peaks can be still called per dataset + by setting --callby to new.ident. Peaks are called + only after applying all RNA related thresholds, + maximum nucleosome signal, and minimum TSS enrichment + scores filters. + Default: do not call peaks + 'sd:layout': + advanced: true + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + default: "classic" + label: "Color theme for all generated plots" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + 'sd:layout': + advanced: true + + parallel_memory_limit: + type: + - "null" + - type: enum + symbols: + - "32" + default: "32" + label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. 
+ Forced to 32 GB + 'sd:layout': + advanced: true + + vector_memory_limit: + type: + - "null" + - type: enum + symbols: + - "96" + default: "96" + label: "Maximum vector memory in GB allowed to be used by R" + doc: | + Maximum vector memory in GB allowed to be used by R. + Forced to 96 GB + 'sd:layout': + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + default: "1" + label: "Number of cores/cpus to use" + doc: | + Number of cores/cpus to use. + Forced to 1 + 'sd:layout': + advanced: true + + +outputs: + + + raw_1_2_qc_mtrcs_pca_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_1_2_qc_mtrcs_pca_plot_png + label: "PC1 and PC2 from the QC metrics PCA (not filtered)" + doc: | + PC1 and PC2 from the QC metrics PCA (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'PC1 and PC2 from the QC metrics PCA' + + raw_2_3_qc_mtrcs_pca_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_2_3_qc_mtrcs_pca_plot_png + label: "PC2 and PC3 from the QC metrics PCA (not filtered)" + doc: | + PC2 and PC3 from the QC metrics PCA (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'PC2 and PC3 from the QC metrics PCA' + + raw_cells_count_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_cells_count_plot_png + label: "Number of cells per dataset (not filtered)" + doc: | + Number of cells per dataset (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Number of cells per dataset' + + raw_rna_umi_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_rna_umi_dnst_plot_png + label: "UMI per cell density for RNA assay (not filtered)" + doc: | + UMI per cell density for RNA assay (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'UMI per cell density for RNA assay' + + raw_gene_dnst_plot_png: + type: File? 
+ outputSource: sc_multiome_filter/raw_gene_dnst_plot_png + label: "Genes per cell density (not filtered)" + doc: | + Genes per cell density (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Genes per cell density' + + raw_gene_umi_corr_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_gene_umi_corr_plot_png + label: "Genes vs UMI per cell correlation for RNA assay (not filtered)" + doc: | + Genes vs UMI per cell correlation for RNA assay (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Genes vs UMI per cell correlation for RNA assay' + + raw_mito_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_mito_dnst_plot_png + label: "Percentage of transcripts mapped to mitochondrial genes per cell density (not filtered)" + doc: | + Percentage of transcripts mapped to mitochondrial genes per cell density (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Percentage of transcripts mapped to mitochondrial genes per cell density' + + raw_nvlt_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_nvlt_dnst_plot_png + label: "Novelty score per cell density for RNA assay (not filtered)" + doc: | + Novelty score per cell density for RNA assay (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Novelty score per cell density for RNA assay' + + raw_atac_umi_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_atac_umi_dnst_plot_png + label: "UMI per cell density for ATAC assay (not filtered)" + doc: | + UMI per cell density for ATAC assay (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'UMI per cell density for ATAC assay' + + raw_peak_dnst_plot_png: + type: File? 
+ outputSource: sc_multiome_filter/raw_peak_dnst_plot_png + label: "Peaks per cell density (not filtered)" + doc: | + Peaks per cell density (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Peaks per cell density' + + raw_blck_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_blck_dnst_plot_png + label: "Fraction of ATAC fragments within genomic blacklist regions per cell density (not filtered)" + doc: | + Fraction of ATAC fragments within genomic blacklist regions per cell density (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Fraction of ATAC fragments within genomic blacklist regions per cell density' + + raw_rna_atac_umi_corr_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_rna_atac_umi_corr_plot_png + label: "UMI per cell correlation for RNA vs ATAC assays (not filtered)" + doc: | + UMI per cell correlation for RNA vs ATAC assays (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'UMI per cell correlation for RNA vs ATAC assays' + + raw_tss_atac_umi_corr_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_tss_atac_umi_corr_plot_png + label: "TSS enrichment score vs UMI per cell correlation for ATAC assay (not filtered)" + doc: | + TSS enrichment score vs UMI per cell correlation for ATAC assay (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'TSS enrichment score vs UMI per cell correlation for ATAC assay' + + raw_qc_mtrcs_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_qc_mtrcs_dnst_plot_png + label: "QC metrics per cell density (not filtered)" + doc: | + QC metrics per cell density (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'QC metrics per cell density' + + raw_tss_nrch_plot_png: + type: File? 
+ outputSource: sc_multiome_filter/raw_tss_nrch_plot_png + label: "TSS enrichment score (not filtered)" + doc: | + TSS enrichment score (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'TSS enrichment score' + + raw_frgm_hist_png: + type: File? + outputSource: sc_multiome_filter/raw_frgm_hist_png + label: "Fragments length histogram (not filtered)" + doc: | + Fragments length histogram (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Fragments length histogram' + + raw_rna_umi_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_rna_umi_dnst_spl_cnd_plot_png + label: "Split by grouping condition UMI per cell density for RNA assay (not filtered)" + doc: | + Split by grouping condition UMI per cell density for RNA assay (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Split by grouping condition UMI per cell density for RNA assay' + + raw_gene_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_gene_dnst_spl_cnd_plot_png + label: "Split by grouping condition genes per cell density (not filtered)" + doc: | + Split by grouping condition genes per cell density (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Split by grouping condition genes per cell density' + + raw_mito_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_mito_dnst_spl_cnd_plot_png + label: "Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density (not filtered)" + doc: | + Split by grouping condition the percentage of transcripts mapped + to mitochondrial genes per cell density (not filtered). 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density' + + raw_nvlt_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_nvlt_dnst_spl_cnd_plot_png + label: "Split by grouping condition the novelty score per cell density for RNA assay (not filtered)" + doc: | + Split by grouping condition the novelty score per cell density for RNA assay (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Split by grouping condition the novelty score per cell density for RNA assay' + + raw_atac_umi_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_atac_umi_dnst_spl_cnd_plot_png + label: "Split by grouping condition UMI per cell density for ATAC assay (not filtered)" + doc: | + Split by grouping condition UMI per cell density for ATAC assay (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Split by grouping condition UMI per cell density for ATAC assay' + + raw_peak_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_peak_dnst_spl_cnd_plot_png + label: "Split by grouping condition peaks per cell density (not filtered)" + doc: | + Split by grouping condition peaks per cell density (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Split by grouping condition peaks per cell density' + + raw_blck_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_blck_dnst_spl_cnd_plot_png + label: "Split by grouping condition the fraction of ATAC fragments within genomic blacklist regions per cell density (not filtered)" + doc: | + Split by grouping condition the fraction of ATAC fragments within genomic + blacklist regions per cell density (not filtered). 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Split by grouping condition the fraction of ATAC fragments within genomic blacklist regions per cell density' + + mid_fltr_1_2_qc_mtrcs_pca_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_1_2_qc_mtrcs_pca_plot_png + label: "PC1 and PC2 from the QC metrics PCA (intermediate filtered)" + doc: | + PC1 and PC2 from the QC metrics PCA (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'PC1 and PC2 from the QC metrics PCA' + + mid_fltr_2_3_qc_mtrcs_pca_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_2_3_qc_mtrcs_pca_plot_png + label: "PC2 and PC3 from the QC metrics PCA (intermediate filtered)" + doc: | + PC2 and PC3 from the QC metrics PCA (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'PC2 and PC3 from the QC metrics PCA' + + mid_fltr_cells_count_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_cells_count_plot_png + label: "Number of cells per dataset (intermediate filtered)" + doc: | + Number of cells per dataset (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'Number of cells per dataset' + + mid_fltr_rna_umi_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_rna_umi_dnst_plot_png + label: "UMI per cell density for RNA assay (intermediate filtered)" + doc: | + UMI per cell density for RNA assay (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'UMI per cell density for RNA assay' + + mid_fltr_gene_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_gene_dnst_plot_png + label: "Genes per cell density (intermediate filtered)" + doc: | + Genes per cell density (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. 
filtered QC' + Caption: 'Genes per cell density' + + mid_fltr_gene_umi_corr_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_gene_umi_corr_plot_png + label: "Genes vs UMI per cell correlation for RNA assay (intermediate filtered)" + doc: | + Genes vs UMI per cell correlation for RNA assay (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'Genes vs UMI per cell correlation for RNA assay' + + mid_fltr_mito_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_mito_dnst_plot_png + label: "Percentage of transcripts mapped to mitochondrial genes per cell density (intermediate filtered)" + doc: | + Percentage of transcripts mapped to mitochondrial genes per cell density (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'Percentage of transcripts mapped to mitochondrial genes per cell density' + + mid_fltr_nvlt_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_nvlt_dnst_plot_png + label: "Novelty score per cell density for RNA assay (intermediate filtered)" + doc: | + Novelty score per cell density for RNA assay (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'Novelty score per cell density for RNA assay' + + mid_fltr_atac_umi_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_atac_umi_dnst_plot_png + label: "UMI per cell density for ATAC assay (intermediate filtered)" + doc: | + UMI per cell density for ATAC assay (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'UMI per cell density for ATAC assay' + + mid_fltr_peak_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_peak_dnst_plot_png + label: "Peaks per cell density (intermediate filtered)" + doc: | + Peaks per cell density (intermediate filtered). 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'Peaks per cell density' + + mid_fltr_blck_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_blck_dnst_plot_png + label: "Fraction of ATAC fragments within genomic blacklist regions per cell density (intermediate filtered)" + doc: | + Fraction of ATAC fragments within genomic blacklist regions per cell density (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'Fraction of ATAC fragments within genomic blacklist regions per cell density' + + mid_fltr_rna_atac_umi_corr_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_rna_atac_umi_corr_plot_png + label: "UMI per cell correlation for RNA vs ATAC assays (intermediate filtered)" + doc: | + UMI per cell correlation for RNA vs ATAC assays (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'UMI per cell correlation for RNA vs ATAC assays' + + mid_fltr_tss_atac_umi_corr_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_tss_atac_umi_corr_plot_png + label: "TSS enrichment score vs UMI per cell correlation for ATAC assay (intermediate filtered)" + doc: | + TSS enrichment score vs UMI per cell correlation for ATAC assay (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'TSS enrichment score vs UMI per cell correlation for ATAC assay' + + mid_fltr_qc_mtrcs_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_qc_mtrcs_dnst_plot_png + label: "QC metrics per cell density (intermediate filtered)" + doc: | + QC metrics per cell density (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'QC metrics per cell density' + + mid_fltr_tss_nrch_plot_png: + type: File? 
+ outputSource: sc_multiome_filter/mid_fltr_tss_nrch_plot_png + label: "TSS enrichment score (intermediate filtered)" + doc: | + TSS enrichment score (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'TSS enrichment score' + + mid_fltr_frgm_hist_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_frgm_hist_png + label: "Fragments length histogram (intermediate filtered)" + doc: | + Fragments length histogram (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'Fragments length histogram' + + mid_fltr_rna_umi_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_rna_umi_dnst_spl_cnd_plot_png + label: "Split by grouping condition UMI per cell density for RNA assay (intermediate filtered)" + doc: | + Split by grouping condition UMI per cell density for RNA assay (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'Split by grouping condition UMI per cell density for RNA assay' + + mid_fltr_gene_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_gene_dnst_spl_cnd_plot_png + label: "Split by grouping condition genes per cell density (intermediate filtered)" + doc: | + Split by grouping condition genes per cell density (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'Split by grouping condition genes per cell density' + + mid_fltr_mito_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_mito_dnst_spl_cnd_plot_png + label: "Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density (intermediate filtered)" + doc: | + Split by grouping condition the percentage of transcripts mapped + to mitochondrial genes per cell density (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. 
filtered QC' + Caption: 'Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density' + + mid_fltr_nvlt_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_nvlt_dnst_spl_cnd_plot_png + label: "Split by grouping condition the novelty score per cell density for RNA assay (intermediate filtered)" + doc: | + Split by grouping condition the novelty score per cell density for RNA assay (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'Split by grouping condition the novelty score per cell density for RNA assay' + + mid_fltr_atac_umi_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_atac_umi_dnst_spl_cnd_plot_png + label: "Split by grouping condition UMI per cell density for ATAC assay (intermediate filtered)" + doc: | + Split by grouping condition UMI per cell density for ATAC assay (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'Split by grouping condition UMI per cell density for ATAC assay' + + mid_fltr_peak_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_peak_dnst_spl_cnd_plot_png + label: "Split by grouping condition peaks per cell density (intermediate filtered)" + doc: | + Split by grouping condition peaks per cell density (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'Split by grouping condition peaks per cell density' + + mid_fltr_blck_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_blck_dnst_spl_cnd_plot_png + label: "Split by grouping condition the fraction of ATAC fragments within genomic blacklist regions per cell density (intermediate filtered)" + doc: | + Split by grouping condition the fraction of ATAC fragments within genomic + blacklist regions per cell density (intermediate filtered). 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'Split by grouping condition the fraction of ATAC fragments within genomic blacklist regions per cell density' + + fltr_1_2_qc_mtrcs_pca_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_1_2_qc_mtrcs_pca_plot_png + label: "PC1 and PC2 from the QC metrics PCA (filtered)" + doc: | + PC1 and PC2 from the QC metrics PCA (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'PC1 and PC2 from the QC metrics PCA' + + fltr_2_3_qc_mtrcs_pca_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_2_3_qc_mtrcs_pca_plot_png + label: "PC2 and PC3 from the QC metrics PCA (filtered)" + doc: | + PC2 and PC3 from the QC metrics PCA (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'PC2 and PC3 from the QC metrics PCA' + + fltr_cells_count_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_cells_count_plot_png + label: "Number of cells per dataset (filtered)" + doc: | + Number of cells per dataset (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Number of cells per dataset' + + fltr_rna_umi_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_rna_umi_dnst_plot_png + label: "UMI per cell density for RNA assay (filtered)" + doc: | + UMI per cell density for RNA assay (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'UMI per cell density for RNA assay' + + fltr_gene_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_gene_dnst_plot_png + label: "Genes per cell density (filtered)" + doc: | + Genes per cell density (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Genes per cell density' + + fltr_gene_umi_corr_plot_png: + type: File? 
+ outputSource: sc_multiome_filter/fltr_gene_umi_corr_plot_png + label: "Genes vs UMI per cell correlation for RNA assay (filtered)" + doc: | + Genes vs UMI per cell correlation for RNA assay (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Genes vs UMI per cell correlation for RNA assay' + + fltr_mito_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_mito_dnst_plot_png + label: "Percentage of transcripts mapped to mitochondrial genes per cell density (filtered)" + doc: | + Percentage of transcripts mapped to mitochondrial genes per cell density (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Percentage of transcripts mapped to mitochondrial genes per cell density' + + fltr_nvlt_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_nvlt_dnst_plot_png + label: "Novelty score per cell density for RNA assay (filtered)" + doc: | + Novelty score per cell density for RNA assay (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Novelty score per cell density for RNA assay' + + fltr_atac_umi_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_atac_umi_dnst_plot_png + label: "UMI per cell density for ATAC assay (filtered)" + doc: | + UMI per cell density for ATAC assay (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'UMI per cell density for ATAC assay' + + fltr_peak_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_peak_dnst_plot_png + label: "Peaks per cell density (filtered)" + doc: | + Peaks per cell density (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Peaks per cell density' + + fltr_blck_dnst_plot_png: + type: File? 
+ outputSource: sc_multiome_filter/fltr_blck_dnst_plot_png + label: "Fraction of ATAC fragments within genomic blacklist regions per cell density (filtered)" + doc: | + Fraction of ATAC fragments within genomic blacklist regions per cell density (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Fraction of ATAC fragments within genomic blacklist regions per cell density' + + fltr_rna_atac_umi_corr_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_rna_atac_umi_corr_plot_png + label: "UMI per cell correlation for RNA vs ATAC assays (filtered)" + doc: | + UMI per cell correlation for RNA vs ATAC assays (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'UMI per cell correlation for RNA vs ATAC assays' + + fltr_tss_atac_umi_corr_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_tss_atac_umi_corr_plot_png + label: "TSS enrichment score vs UMI per cell correlation for ATAC assay (filtered)" + doc: | + TSS enrichment score vs UMI per cell correlation for ATAC assay (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'TSS enrichment score vs UMI per cell correlation for ATAC assay' + + fltr_qc_mtrcs_dnst_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_qc_mtrcs_dnst_plot_png + label: "QC metrics per cell density (filtered)" + doc: | + QC metrics per cell density (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'QC metrics per cell density' + + fltr_tss_nrch_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_tss_nrch_plot_png + label: "TSS enrichment score (filtered)" + doc: | + TSS enrichment score (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'TSS enrichment score' + + fltr_frgm_hist_png: + type: File? 
+ outputSource: sc_multiome_filter/fltr_frgm_hist_png + label: "Fragments length histogram (filtered)" + doc: | + Fragments length histogram (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Fragments length histogram' + + fltr_rna_umi_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_rna_umi_dnst_spl_cnd_plot_png + label: "Split by grouping condition UMI per cell density for RNA assay (filtered)" + doc: | + Split by grouping condition UMI per cell density for RNA assay (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Split by grouping condition UMI per cell density for RNA assay' + + fltr_gene_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_gene_dnst_spl_cnd_plot_png + label: "Split by grouping condition genes per cell density (filtered)" + doc: | + Split by grouping condition genes per cell density (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Split by grouping condition genes per cell density' + + fltr_mito_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_mito_dnst_spl_cnd_plot_png + label: "Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density (filtered)" + doc: | + Split by grouping condition the percentage of transcripts mapped + to mitochondrial genes per cell density (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density' + + fltr_nvlt_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_nvlt_dnst_spl_cnd_plot_png + label: "Split by grouping condition the novelty score per cell density for RNA assay (filtered)" + doc: | + Split by grouping condition the novelty score per cell density for RNA assay (filtered). 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Split by grouping condition the novelty score per cell density for RNA assay' + + fltr_atac_umi_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_atac_umi_dnst_spl_cnd_plot_png + label: "Split by grouping condition UMI per cell density for ATAC assay (filtered)" + doc: | + Split by grouping condition UMI per cell density for ATAC assay (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Split by grouping condition UMI per cell density for ATAC assay' + + fltr_peak_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_peak_dnst_spl_cnd_plot_png + label: "Split by grouping condition peaks per cell density (filtered)" + doc: | + Split by grouping condition peaks per cell density (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Split by grouping condition peaks per cell density' + + fltr_blck_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_blck_dnst_spl_cnd_plot_png + label: "Split by grouping condition the fraction of ATAC fragments within genomic blacklist regions per cell density (filtered)" + doc: | + Split by grouping condition the fraction of ATAC fragments within genomic + blacklist regions per cell density (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Split by grouping condition the fraction of ATAC fragments within genomic blacklist regions per cell density' + + ucsc_cb_config_data: + type: File + outputSource: compress_cellbrowser_config_data/compressed_folder + label: "Compressed directory with UCSC Cellbrowser configuration data" + doc: | + Compressed directory with UCSC Cellbrowser configuration data. 
+ + ucsc_cb_html_data: + type: Directory + outputSource: sc_multiome_filter/ucsc_cb_html_data + label: "Directory with UCSC Cellbrowser html data" + doc: | + Directory with UCSC Cellbrowser html data. + + ucsc_cb_html_file: + type: File + outputSource: sc_multiome_filter/ucsc_cb_html_file + label: "Open in UCSC Cell Browser" + doc: | + HTML index file from the directory with UCSC Cellbrowser html data. + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + seurat_data_rds: + type: File + outputSource: sc_multiome_filter/seurat_data_rds + label: "Processed Seurat data in RDS format" + doc: | + Processed Seurat data in RDS format + + sc_multiome_filter_stdout_log: + type: File + outputSource: sc_multiome_filter/stdout_log + label: "stdout log generated by sc_multiome_filter step" + doc: | + stdout log generated by sc_multiome_filter step + + sc_multiome_filter_stderr_log: + type: File + outputSource: sc_multiome_filter/stderr_log + label: "stderr log generated by sc_multiome_filter step" + doc: | + stderr log generated by sc_multiome_filter step + + +steps: + + uncompress_feature_bc_matrices: + doc: | + Extracts the content of TAR file into a folder + run: ../tools/tar-extract.cwl + in: + file_to_extract: filtered_feature_bc_matrix_folder + out: + - extracted_folder + + sc_multiome_filter: + doc: | + Filters single-cell multiome ATAC and RNA-Seq datasets + based on the common QC metrics + run: ../tools/sc-multiome-filter.cwl + in: + feature_bc_matrices_folder: uncompress_feature_bc_matrices/extracted_folder + aggregation_metadata: aggregation_metadata + atac_fragments_file: atac_fragments_file + annotation_gtf_file: annotation_gtf_file + grouping_data: grouping_data + blacklist_regions_file: blacklist_regions_file + barcodes_data: barcodes_data + rna_minimum_cells: + default: 1 + minimum_genes: + source: minimum_genes + valueFrom: $(split_numbers(self)) + maximum_genes: + source: maximum_genes + valueFrom: $(split_numbers(self)) + 
rna_minimum_umi: + source: rna_minimum_umi + valueFrom: $(split_numbers(self)) + mito_pattern: mito_pattern + maximum_mito_perc: maximum_mito_perc + minimum_novelty_score: + source: minimum_novelty_score + valueFrom: $(split_numbers(self)) + atac_minimum_cells: + default: 1 + atac_minimum_umi: + source: atac_minimum_umi + valueFrom: $(split_numbers(self)) + maximum_nucl_signal: + source: maximum_nucl_signal + valueFrom: $(split_numbers(self)) + minimum_tss_enrich: + source: minimum_tss_enrich + valueFrom: $(split_numbers(self)) + minimum_frip: + source: minimum_frip + valueFrom: $(split_numbers(self)) + maximum_blacklist_fraction: + source: maximum_blacklist_fraction + valueFrom: $(split_numbers(self)) + call_by: call_by + verbose: + default: true + export_ucsc_cb: + default: true + color_theme: color_theme + parallel_memory_limit: + source: parallel_memory_limit + valueFrom: $(parseInt(self)) + vector_memory_limit: + source: vector_memory_limit + valueFrom: $(parseInt(self)) + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - raw_1_2_qc_mtrcs_pca_plot_png + - raw_2_3_qc_mtrcs_pca_plot_png + - raw_cells_count_plot_png + - raw_rna_umi_dnst_plot_png + - raw_gene_dnst_plot_png + - raw_gene_umi_corr_plot_png + - raw_mito_dnst_plot_png + - raw_nvlt_dnst_plot_png + - raw_atac_umi_dnst_plot_png + - raw_peak_dnst_plot_png + - raw_blck_dnst_plot_png + - raw_rna_atac_umi_corr_plot_png + - raw_tss_atac_umi_corr_plot_png + - raw_qc_mtrcs_dnst_plot_png + - raw_tss_nrch_plot_png + - raw_frgm_hist_png + - raw_rna_umi_dnst_spl_cnd_plot_png + - raw_gene_dnst_spl_cnd_plot_png + - raw_mito_dnst_spl_cnd_plot_png + - raw_nvlt_dnst_spl_cnd_plot_png + - raw_atac_umi_dnst_spl_cnd_plot_png + - raw_peak_dnst_spl_cnd_plot_png + - raw_blck_dnst_spl_cnd_plot_png + - mid_fltr_1_2_qc_mtrcs_pca_plot_png + - mid_fltr_2_3_qc_mtrcs_pca_plot_png + - mid_fltr_cells_count_plot_png + - mid_fltr_rna_umi_dnst_plot_png + - mid_fltr_gene_dnst_plot_png + - mid_fltr_gene_umi_corr_plot_png + 
- mid_fltr_mito_dnst_plot_png + - mid_fltr_nvlt_dnst_plot_png + - mid_fltr_atac_umi_dnst_plot_png + - mid_fltr_peak_dnst_plot_png + - mid_fltr_blck_dnst_plot_png + - mid_fltr_rna_atac_umi_corr_plot_png + - mid_fltr_tss_atac_umi_corr_plot_png + - mid_fltr_qc_mtrcs_dnst_plot_png + - mid_fltr_tss_nrch_plot_png + - mid_fltr_frgm_hist_png + - mid_fltr_rna_umi_dnst_spl_cnd_plot_png + - mid_fltr_gene_dnst_spl_cnd_plot_png + - mid_fltr_mito_dnst_spl_cnd_plot_png + - mid_fltr_nvlt_dnst_spl_cnd_plot_png + - mid_fltr_atac_umi_dnst_spl_cnd_plot_png + - mid_fltr_peak_dnst_spl_cnd_plot_png + - mid_fltr_blck_dnst_spl_cnd_plot_png + - fltr_1_2_qc_mtrcs_pca_plot_png + - fltr_2_3_qc_mtrcs_pca_plot_png + - fltr_cells_count_plot_png + - fltr_rna_umi_dnst_plot_png + - fltr_gene_dnst_plot_png + - fltr_gene_umi_corr_plot_png + - fltr_mito_dnst_plot_png + - fltr_nvlt_dnst_plot_png + - fltr_atac_umi_dnst_plot_png + - fltr_peak_dnst_plot_png + - fltr_blck_dnst_plot_png + - fltr_rna_atac_umi_corr_plot_png + - fltr_tss_atac_umi_corr_plot_png + - fltr_qc_mtrcs_dnst_plot_png + - fltr_tss_nrch_plot_png + - fltr_frgm_hist_png + - fltr_rna_umi_dnst_spl_cnd_plot_png + - fltr_gene_dnst_spl_cnd_plot_png + - fltr_mito_dnst_spl_cnd_plot_png + - fltr_nvlt_dnst_spl_cnd_plot_png + - fltr_atac_umi_dnst_spl_cnd_plot_png + - fltr_peak_dnst_spl_cnd_plot_png + - fltr_blck_dnst_spl_cnd_plot_png + - ucsc_cb_config_data + - ucsc_cb_html_data + - ucsc_cb_html_file + - seurat_data_rds + - stdout_log + - stderr_log + + compress_cellbrowser_config_data: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: sc_multiome_filter/ucsc_cb_config_data + out: + - compressed_folder + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-cell Multiome ATAC and RNA-Seq Filtering Analysis" +s:name: "Single-cell Multiome ATAC and RNA-Seq Filtering Analysis" +s:alternateName: "Filters single-cell multiome ATAC 
and RNA-Seq datasets based on the common QC metrics" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-multiome-filter.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell Multiome ATAC and RNA-Seq Filtering Analysis + + Filters single-cell multiome ATAC and RNA-Seq datasets + based on the common QC metrics. 
\ No newline at end of file diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl new file mode 100644 index 00000000..e57c0945 --- /dev/null +++ b/workflows/sc-rna-cluster.cwl @@ -0,0 +1,614 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var split_features = function(line) { + function get_unique(value, index, self) { + return self.indexOf(value) === index && value != ""; + } + let splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + - var split_numbers = function(line) { + let splitted_line = line?line.split(/[\s,]+/).map(parseFloat):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + + +'sd:upstream': + sc_tools_sample: + - "sc-rna-cluster.cwl" + - "sc-atac-cluster.cwl" + - "sc-rna-reduce.cwl" + - "sc-atac-reduce.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name/alias" + sd:preview: + position: 1 + + query_data_rds: + type: File + label: "Experiment run through Single-cell RNA-Seq Dimensionality Reduction Analysis" + doc: | + Path to the RDS file to load Seurat object from. This file should include genes + expression information stored in the RNA assay, as well as 'pca' and 'rnaumap' + dimensionality reductions applied to that assay. + 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" + 'sd:localLabel': true + + dimensions: + type: int? + default: 40 + label: "Dimensionality to use when constructing nearest-neighbor graph before clustering (from 1 to 50)" + doc: | + Dimensionality to use when constructing nearest- + neighbor graph before clustering (from 1 to 50). If + single value N is provided, use from 1 to N + dimensions. If multiple values are provided, subset to + only selected dimensions. 
+ Default: from 1 to 40 + + cluster_algorithm: + type: + - "null" + - type: enum + symbols: + - "louvain" + - "mult-louvain" + - "slm" + - "leiden" + default: "louvain" + label: "Algorithm for modularity optimization when running clustering" + doc: | + Algorithm for modularity optimization when running clustering. + Default: louvain + + resolution: + type: float? + default: 0.3 + label: "Clustering resolution" + doc: | + Clustering resolution applied to the constructed nearest-neighbor graph. + Can be set as an array but only the first item from the list will be used + for cluster labels and gene markers in the UCSC Cell Browser when running + with --cbbuild and --diffgenes parameters. + Default: 0.3 + + genes_of_interest: + type: string? + default: null + label: "Comma or space separated list of genes of interest" + doc: | + Genes of interest to build genes expression plots. + Default: None + + identify_diff_genes: + type: boolean? + default: false + label: "Identify differentially expressed genes between each pair of clusters" + doc: | + Identify differentially expressed genes (putative gene markers) between each + pair of clusters for all resolutions. + Default: false + 'sd:layout': + advanced: true + + minimum_logfc: + type: float? + default: 0.25 + label: "Include only those genes that on average have log fold change difference in expression between every tested pair of clusters not lower than this value" + doc: | + For putative gene markers identification include only those genes that + on average have log fold change difference in expression between every + tested pair of clusters not lower than this value. Ignored if '--diffgenes' + is not set. + Default: 0.25 + 'sd:layout': + advanced: true + + minimum_pct: + type: float?
+ default: 0.1 + label: "Include only those genes that are detected in not lower than this fraction of cells in either of the two tested clusters" + doc: | + For putative gene markers identification include only those genes that + are detected in not lower than this fraction of cells in either of the + two tested clusters. Ignored if '--diffgenes' is not set. + Default: 0.1 + 'sd:layout': + advanced: true + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + default: "classic" + label: "Color theme for all generated plots" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + 'sd:layout': + advanced: true + + parallel_memory_limit: + type: + - "null" + - type: enum + symbols: + - "32" + default: "32" + label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Forced to 32 GB + 'sd:layout': + advanced: true + + vector_memory_limit: + type: + - "null" + - type: enum + symbols: + - "64" + default: "64" + label: "Maximum vector memory in GB allowed to be used by R" + doc: | + Maximum vector memory in GB allowed to be used by R. + Forced to 64 GB + 'sd:layout': + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + default: "1" + label: "Number of cores/cpus to use" + doc: | + Number of cores/cpus to use + Forced to 1 + 'sd:layout': + advanced: true + + +outputs: + + umap_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/umap_res_plot_png + label: "Clustered cells UMAP" + doc: | + Clustered cells UMAP. 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Clustered cells UMAP' + + slh_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/slh_res_plot_png + label: "Silhouette scores. Downsampled to max 500 cells per cluster." + doc: | + Silhouette scores. Downsampled to max 500 cells per cluster. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Silhouette scores. Downsampled to max 500 cells per cluster.' + + umap_spl_idnt_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/umap_spl_idnt_res_plot_png + label: "Split by dataset clustered cells UMAP" + doc: | + Split by dataset clustered cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by dataset clustered cells UMAP' + + cmp_gr_clst_spl_idnt_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/cmp_gr_clst_spl_idnt_res_plot_png + label: "Grouped by cluster split by dataset cells composition plot. Downsampled." + doc: | + Grouped by cluster split by dataset cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Grouped by cluster split by dataset cells composition plot. Downsampled.' + + cmp_gr_idnt_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/cmp_gr_idnt_spl_clst_res_plot_png + label: "Grouped by dataset split by cluster cells composition plot. Downsampled." + doc: | + Grouped by dataset split by cluster cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Grouped by dataset split by cluster cells composition plot. Downsampled.' 
+ + umap_spl_cnd_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/umap_spl_cnd_res_plot_png + label: "Split by grouping condition clustered cells UMAP" + doc: | + Split by grouping condition clustered cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Split by grouping condition clustered cells UMAP' + + cmp_gr_clst_spl_cnd_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/cmp_gr_clst_spl_cnd_res_plot_png + label: "Grouped by cluster split by condition cells composition plot. Downsampled." + doc: | + Grouped by cluster split by condition cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Grouped by cluster split by condition cells composition plot. Downsampled.' + + cmp_gr_cnd_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/cmp_gr_cnd_spl_clst_res_plot_png + label: "Grouped by condition split by cluster cells composition plot. Downsampled." + doc: | + Grouped by condition split by cluster cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Grouped by condition split by cluster cells composition plot. Downsampled.' + + umap_spl_ph_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/umap_spl_ph_res_plot_png + label: "Split by cell cycle phase clustered cells UMAP" + doc: | + Split by cell cycle phase clustered cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by cell cycle phase clustered cells UMAP' + + cmp_gr_ph_spl_idnt_plot_png: + type: File? + outputSource: sc_rna_cluster/cmp_gr_ph_spl_idnt_plot_png + label: "Grouped by cell cycle phase split by dataset cells composition plot. Downsampled." + doc: | + Grouped by cell cycle phase split by dataset cells composition plot. 
Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Grouped by cell cycle phase split by dataset cells composition plot. Downsampled.' + + cmp_gr_ph_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/cmp_gr_ph_spl_clst_res_plot_png + label: "Grouped by cell cycle phase split by cluster cells composition plot. Downsampled." + doc: | + Grouped by cell cycle phase split by cluster cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Grouped by cell cycle phase split by cluster cells composition plot. Downsampled.' + + xpr_avg_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/xpr_avg_res_plot_png + label: "Log normalized scaled average gene expression per cluster" + doc: | + Log normalized scaled average gene expression per cluster. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized scaled average gene expression per cluster' + + xpr_per_cell_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/xpr_per_cell_plot_png + label: "Log normalized gene expression on cells UMAP" + doc: | + Log normalized gene expression on cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized gene expression on cells UMAP' + + xpr_per_cell_sgnl_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/xpr_per_cell_sgnl_plot_png + label: "Log normalized gene expression density on cells UMAP" + doc: | + Log normalized gene expression density on cells UMAP. 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized gene expression density on cells UMAP' + + xpr_dnst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/xpr_dnst_res_plot_png + label: "Log normalized gene expression density per cluster" + doc: | + Log normalized gene expression density per cluster. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized gene expression density per cluster' + + xpr_htmp_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/xpr_htmp_res_plot_png + label: "Normalized gene expression heatmap grouped by cluster" + doc: | + Normalized gene expression heatmap grouped by cluster. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Normalized gene expression heatmap grouped by cluster' + + gene_markers_tsv: + type: File? + outputSource: sc_rna_cluster/gene_markers_tsv + label: "Differentially expressed genes between each pair of clusters" + doc: | + Differentially expressed genes between each pair of clusters for all resolutions. + TSV format + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'Gene markers' + Title: 'Differentially expressed genes between each pair of clusters' + + ucsc_cb_config_data: + type: File + outputSource: compress_cellbrowser_config_data/compressed_folder + label: "Compressed directory with UCSC Cellbrowser configuration data" + doc: | + Compressed directory with UCSC Cellbrowser configuration data. + + ucsc_cb_html_data: + type: Directory + outputSource: sc_rna_cluster/ucsc_cb_html_data + label: "Directory with UCSC Cellbrowser html data" + doc: | + Directory with UCSC Cellbrowser html data. + + ucsc_cb_html_file: + type: File + outputSource: sc_rna_cluster/ucsc_cb_html_file + label: "Open in UCSC Cell Browser" + doc: | + HTML index file from the directory with UCSC Cellbrowser html data. 
+ 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + seurat_data_rds: + type: File + outputSource: sc_rna_cluster/seurat_data_rds + label: "Processed Seurat data in RDS format" + doc: | + Processed Seurat data in RDS format + + sc_rna_cluster_stdout_log: + type: File + outputSource: sc_rna_cluster/stdout_log + label: "stdout log generated by sc_rna_cluster step" + doc: | + stdout log generated by sc_rna_cluster step + + sc_rna_cluster_stderr_log: + type: File + outputSource: sc_rna_cluster/stderr_log + label: "stderr log generated by sc_rna_cluster step" + doc: | + stderr log generated by sc_rna_cluster step + + +steps: + + sc_rna_cluster: + doc: | + Clusters single-cell RNA-Seq datasets, identifies gene markers + run: ../tools/sc-rna-cluster.cwl + in: + query_data_rds: query_data_rds + dimensions: dimensions + cluster_metric: + default: euclidean + cluster_algorithm: cluster_algorithm + resolution: resolution + genes_of_interest: + source: genes_of_interest + valueFrom: $(split_features(self)) + identify_diff_genes: identify_diff_genes + minimum_logfc: minimum_logfc + minimum_pct: minimum_pct + only_positive_diff_genes: + default: true + test_to_use: + default: wilcox + verbose: + default: true + export_ucsc_cb: + default: true + color_theme: color_theme + parallel_memory_limit: + source: parallel_memory_limit + valueFrom: $(parseInt(self)) + vector_memory_limit: + source: vector_memory_limit + valueFrom: $(parseInt(self)) + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - umap_res_plot_png + - slh_res_plot_png + - umap_spl_idnt_res_plot_png + - cmp_gr_clst_spl_idnt_res_plot_png + - cmp_gr_idnt_spl_clst_res_plot_png + - umap_spl_cnd_res_plot_png + - cmp_gr_clst_spl_cnd_res_plot_png + - cmp_gr_cnd_spl_clst_res_plot_png + - umap_spl_ph_res_plot_png + - cmp_gr_ph_spl_idnt_plot_png + - cmp_gr_ph_spl_clst_res_plot_png + - xpr_avg_res_plot_png + - xpr_per_cell_plot_png + - xpr_per_cell_sgnl_plot_png + - xpr_dnst_res_plot_png + 
- xpr_htmp_res_plot_png + - gene_markers_tsv + - ucsc_cb_config_data + - ucsc_cb_html_data + - ucsc_cb_html_file + - seurat_data_rds + - stdout_log + - stderr_log + + compress_cellbrowser_config_data: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: sc_rna_cluster/ucsc_cb_config_data + out: + - compressed_folder + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-cell RNA-Seq Cluster Analysis" +s:name: "Single-cell RNA-Seq Cluster Analysis" +s:alternateName: "Clusters single-cell RNA-Seq datasets, identifies gene markers" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-cluster.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell RNA-Seq Cluster Analysis + =============================================================== + Clusters single-cell RNA-Seq datasets, identifies gene markers. 
\ No newline at end of file diff --git a/workflows/sc-rna-da-cells.cwl b/workflows/sc-rna-da-cells.cwl new file mode 100644 index 00000000..753bef07 --- /dev/null +++ b/workflows/sc-rna-da-cells.cwl @@ -0,0 +1,473 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var split_numbers = function(line) { + var splitted_line = line?line.split(/[\s,]+/).map(parseFloat):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + + +'sd:upstream': + sc_tools_sample: + - "sc-rna-reduce.cwl" + - "sc-atac-reduce.cwl" + - "sc-rna-cluster.cwl" + - "sc-atac-cluster.cwl" + - "sc-wnn-cluster.cwl" + - "sc-ctype-assign.cwl" + - "sc-rna-de-pseudobulk.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name/alias" + sd:preview: + position: 1 + + query_data_rds: + type: File + label: "Experiment run through Single-cell RNA-Seq Dimensionality Reduction Analysis" + doc: | + Path to the RDS file to load Seurat object from. This file should include genes + expression information stored in the RNA assay and selected with the --reduction + parameter dimensionality reduction. Additionally, 'rnaumap', and/or 'atacumap', + and/or 'wnnumap' dimensionality reductions should be present. + 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" + 'sd:localLabel': true + + splitby: + type: string + label: "Column from the Seurat object metadata to split cells into two groups" + doc: | + Column from the Seurat object metadata to split cells into two groups + to run --second vs --first DA analysis. May include columns from the + extra metadata added with --metadata parameter. 
+ + first_cond: + type: string + label: "Value from the Seurat object metadata column to define the first group of cells" + doc: | + Value from the Seurat object metadata column set with --splitby to define + the first group of cells for DA analysis. + + second_cond: + type: string + label: "Value from the Seurat object metadata column to define the second group of cells" + doc: | + Value from the Seurat object metadata column set with --splitby to define + the second group of cells for DA analysis. + + dimensions: + type: int? + default: 20 + label: "Dimensionality to use when running DA analysis (from 1 to 50)" + doc: | + Dimensionality to use when running DA analysis (from 1 to 50). + If single value N is provided, use from 1 to N PCs. If multiple + values are provided, subset to only selected PCs. + Default: from 1 to 20 + + resolution: + type: string? + default: "0.05 0.1 0.15" + label: "Clustering resolution applied to DA cells to identify DA cells populations" + doc: | + Clustering resolution applied to DA cells to identify DA cells populations. + Can be set as an array. + Default: 0.05, 0.1, 0.15 + + ranges: + type: string? + default: "-0.5 0.5" + label: "DA score ranges used to filter out non-significant cells" + doc: | + DA score ranges used to filter out non-significant cells. + Default: -0.5, 0.5 + + datasets_metadata: + type: File? + label: "Path to the TSV/CSV file to optionally extend Seurat object metadata" + doc: | + Path to the TSV/CSV file to optionally extend Seurat object metadata with + categorical values using samples identities. First column - 'library_id' + should correspond to all unique values from the 'new.ident' column of the + loaded Seurat object. If any of the provided in this file columns are already + present in the Seurat object metadata, they will be overwritten. 
+ Default: no extra metadata is added + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + default: "classic" + label: "Color theme for all generated plots" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + 'sd:layout': + advanced: true + + parallel_memory_limit: + type: + - "null" + - type: enum + symbols: + - "32" + default: "32" + label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Forced to 32 GB + 'sd:layout': + advanced: true + + vector_memory_limit: + type: + - "null" + - type: enum + symbols: + - "64" + default: "64" + label: "Maximum vector memory in GB allowed to be used by R" + doc: | + Maximum vector memory in GB allowed to be used by R. + Forced to 64 GB + 'sd:layout': + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + default: "1" + label: "Number of cores/cpus to use" + doc: | + Number of cores/cpus to use + Forced to 1 + 'sd:layout': + advanced: true + + +outputs: + + da_perm_plot_png: + type: File? + outputSource: da_cells/da_perm_plot_png + label: "DA scores random permutations plot" + doc: | + DA scores random permutations plot for second + vs first biological conditions comparison. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'DA scores random permutations plot' + + umap_rd_rnaumap_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: da_cells/umap_rd_rnaumap_res_plot_png + label: "Clustered DA cells subpopulations RNA UMAP" + doc: | + Clustered DA cells subpopulations UMAP (rnaumap dim. reduction). 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Clustered DA cells subpopulations RNA UMAP' + + umap_rd_atacumap_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: da_cells/umap_rd_atacumap_res_plot_png + label: "Clustered DA cells subpopulations ATAC UMAP" + doc: | + Clustered DA cells subpopulations UMAP (atacumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Clustered DA cells subpopulations ATAC UMAP' + + umap_rd_wnnumap_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: da_cells/umap_rd_wnnumap_res_plot_png + label: "Clustered DA cells subpopulations WNN UMAP" + doc: | + Clustered DA cells subpopulations UMAP (wnnumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Clustered DA cells subpopulations WNN UMAP' + + umap_spl_cnd_rd_rnaumap_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: da_cells/umap_spl_cnd_rd_rnaumap_res_plot_png + label: "Split by grouping condition clustered DA cells subpopulations RNA UMAP" + doc: | + Split by grouping condition clustered DA cells subpopulations UMAP + (rnaumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Split by grouping condition clustered DA cells subpopulations RNA UMAP' + + umap_spl_cnd_rd_atacumap_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: da_cells/umap_spl_cnd_rd_atacumap_res_plot_png + label: "Split by grouping condition clustered DA cells subpopulations ATAC UMAP" + doc: | + Split by grouping condition clustered DA cells subpopulations UMAP + (atacumap dim. reduction). 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Split by grouping condition clustered DA cells subpopulations ATAC UMAP' + + umap_spl_cnd_rd_wnnumap_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: da_cells/umap_spl_cnd_rd_wnnumap_res_plot_png + label: "Split by grouping condition clustered DA cells subpopulations WNN UMAP" + doc: | + Split by grouping condition clustered DA cells subpopulations UMAP + (wnnumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Split by grouping condition clustered DA cells subpopulations WNN UMAP' + + umap_spl_idnt_rd_rnaumap_da_scr_plot_png: + type: File? + outputSource: da_cells/umap_spl_idnt_rd_rnaumap_da_scr_plot_png + label: "Split by dataset cells RNA UMAP with DA scores" + doc: | + Split by dataset cells UMAP with DA scores for second vs first + biological conditions comparison (rnaumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by dataset cells RNA UMAP with DA scores' + + umap_spl_idnt_rd_atacumap_da_scr_plot_png: + type: File? + outputSource: da_cells/umap_spl_idnt_rd_atacumap_da_scr_plot_png + label: "Split by dataset cells ATAC UMAP with DA scores" + doc: | + Split by dataset cells UMAP with DA scores for second vs first + biological conditions comparison (atacumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by dataset cells ATAC UMAP with DA scores' + + umap_spl_idnt_rd_wnnumap_da_scr_plot_png: + type: File? + outputSource: da_cells/umap_spl_idnt_rd_wnnumap_da_scr_plot_png + label: "Split by dataset cells WNN UMAP with DA scores" + doc: | + Split by dataset cells UMAP with DA scores for second vs first + biological conditions comparison (wnnumap dim. reduction). 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by dataset cells WNN UMAP with DA scores' + + ucsc_cb_config_data: + type: File + outputSource: compress_cellbrowser_config_data/compressed_folder + label: "Compressed directory with UCSC Cellbrowser configuration data" + doc: | + Compressed directory with UCSC Cellbrowser configuration data. + + ucsc_cb_html_data: + type: Directory + outputSource: da_cells/ucsc_cb_html_data + label: "Directory with UCSC Cellbrowser html data" + doc: | + Directory with UCSC Cellbrowser html data. + + ucsc_cb_html_file: + type: File + outputSource: da_cells/ucsc_cb_html_file + label: "Open in UCSC Cell Browser" + doc: | + HTML index file from the directory with UCSC Cellbrowser html data. + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + seurat_data_rds: + type: File + outputSource: da_cells/seurat_data_rds + label: "Processed Seurat data in RDS format" + doc: | + Processed Seurat data in RDS format + + da_cells_stdout_log: + type: File + outputSource: da_cells/stdout_log + label: "stdout log generated by da_cells step" + doc: | + stdout log generated by da_cells step + + da_cells_stderr_log: + type: File + outputSource: da_cells/stderr_log + label: "stderr log generated by da_cells step" + doc: | + stderr log generated by da_cells step + + +steps: + + da_cells: + run: ../tools/sc-rna-da-cells.cwl + in: + query_data_rds: query_data_rds + datasets_metadata: datasets_metadata + dimensions: dimensions + splitby: splitby + first_cond: first_cond + second_cond: second_cond + resolution: + source: resolution + valueFrom: $(split_numbers(self)) + ranges: + source: ranges + valueFrom: $(split_numbers(self)) + verbose: + default: true + export_ucsc_cb: + default: true + color_theme: color_theme + parallel_memory_limit: + source: parallel_memory_limit + valueFrom: $(parseInt(self)) + vector_memory_limit: + source: vector_memory_limit + valueFrom: $(parseInt(self)) + threads: + 
source: threads + valueFrom: $(parseInt(self)) + out: + - da_perm_plot_png + - umap_rd_rnaumap_res_plot_png + - umap_rd_atacumap_res_plot_png + - umap_rd_wnnumap_res_plot_png + - umap_spl_cnd_rd_rnaumap_res_plot_png + - umap_spl_cnd_rd_atacumap_res_plot_png + - umap_spl_cnd_rd_wnnumap_res_plot_png + - umap_spl_idnt_rd_rnaumap_da_scr_plot_png + - umap_spl_idnt_rd_atacumap_da_scr_plot_png + - umap_spl_idnt_rd_wnnumap_da_scr_plot_png + - ucsc_cb_config_data + - ucsc_cb_html_data + - ucsc_cb_html_file + - seurat_data_rds + - stdout_log + - stderr_log + + compress_cellbrowser_config_data: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: da_cells/ucsc_cb_config_data + out: + - compressed_folder + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-cell Differential Abundance Analysis" +s:name: "Single-cell Differential Abundance Analysis" +s:alternateName: "Detects cell subpopulations with differential abundance between datasets split by biological condition" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-da-cells.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: 
s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell Differential Abundance Analysis + + Detects cell subpopulations with differential abundance + between datasets split by biological condition. \ No newline at end of file diff --git a/workflows/sc-rna-de-pseudobulk.cwl b/workflows/sc-rna-de-pseudobulk.cwl new file mode 100644 index 00000000..ad588b5c --- /dev/null +++ b/workflows/sc-rna-de-pseudobulk.cwl @@ -0,0 +1,745 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var split_features = function(line) { + function get_unique(value, index, self) { + return self.indexOf(value) === index && value != ""; + } + var splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + + +'sd:upstream': + sc_tools_sample: + - "sc-rna-cluster.cwl" + - "sc-ctype-assign.cwl" + - "sc-wnn-cluster.cwl" + - "sc-rna-da-cells.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name/alias" + sd:preview: + position: 1 + + query_data_rds: + type: File + label: "Experiment run through any of the Single-cell Cluster or Manual Cell Type Assignment Analysis" + doc: | + Path to the RDS file to load Seurat object from. This file should include genes + expression information stored in the RNA assay. Additionally, 'rnaumap', and/or + 'atacumap', and/or 'wnnumap' dimensionality reductions should be present. 
+ 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" + 'sd:localLabel': true + + splitby: + type: string + label: "Column from the Seurat object metadata to split datasets into two groups" + doc: | + Column from the Seurat object metadata to split datasets into two groups + to run --second vs --first pseudobulk DE analysis, i.e., calculate log2FC. + May be one of the columns from the extra metadata added with --metadata + parameter. Provided value should group the datasets, not cells, therefore + do not use a column with clustering results. + + first_cond: + type: string + label: "Value from the Seurat object metadata column to define the first group of datasets" + doc: | + Value from the Seurat object metadata column set with --splitby to define the + first group of datasets for pseudobulk DE analysis. + + second_cond: + type: string + label: "Value from the Seurat object metadata column to define the second group of datasets" + doc: | + Value from the Seurat object metadata column set with --splitby to define the + second group of datasets for pseudobulk DE analysis. + + batchby: + type: string? + default: null + label: "Column from the Seurat object metadata to group datasets into batches" + doc: | + Column from the Seurat object metadata to group datasets into batches. It will be used + as a factor variable to model batch effect when running pseudobulk DE analysis (makes + design formula look like ~splitby+batchby). May be one of the columns from the extra + metadata added with --metadata parameter. Provided value should batch the datasets, not + cells, therefore do not use a column with clustering results. Default: do not model + batch effect. + + groupby: + type: string? + default: null + label: "Column from the Seurat object metadata to group cells for optional subsetting" + doc: | + Column from the Seurat object metadata to group cells for optional subsetting + when combined with --subset parameter. 
May be one of the columns from the extra + metadata added with --metadata parameter. Ignored if --subset is not set. Provided + value defines the groups of cells, therefore any metadata column, including the + clustering results, may be used. Default: do not subset, run pseudobulk DE analysis + for all cells jointly + + subset: + type: string? + default: null + label: "Value(s) to subset cells before running analysis" + doc: | + Value(s) from the column set with --groupby parameter to subset cells + before running pseudobulk DE analysis. If multiple values are provided + run analysis jointly for selected groups of cells. Ignored if --groupby + is not set. Default: do not subset, run pseudobulk DE analysis for all + cells jointly + + datasets_metadata: + type: File? + label: "Path to the TSV/CSV file to optionally extend Seurat object metadata" + doc: | + Path to the TSV/CSV file to optionally extend Seurat object metadata with + categorical values using samples identities. First column - 'library_id' + should correspond to all unique values from the 'new.ident' column of the + loaded Seurat object. If any of the provided in this file columns are already + present in the Seurat object metadata, they will be overwritten. Default: no + extra metadata is added + + lrt: + type: boolean? + default: false + label: "Use LRT instead of the pair-wise Wald test" + doc: | + Use LRT instead of the pair-wise Wald test. If --batchby is not provided + use ~1 as a reduced formula, otherwise ~batchby. Default: use Wald test + 'sd:layout': + advanced: true + + maximum_padj: + type: float? + default: 0.05 + label: "Maximum significance level used in the exploratory visualization part of the analysis" + doc: | + In the exploratory visualization part of the analysis output only features + with adjusted P-value not bigger than this value. Default: 0.05 + 'sd:layout': + advanced: true + + genes_of_interest: + type: string? 
+ default: null + label: "Genes of interest to label on the generated plots" + doc: | + Genes of interest to label on the generated plots. Default: top 10 genes + with the highest and the lowest log2FC expression values. + 'sd:layout': + advanced: true + + exclude_pattern: + type: string? + default: null + label: "Regex pattern to identify and exclude non-coding RNA genes from the analysis" + doc: | + Regex pattern to identify and exclude non-coding RNA genes from the pseudobulk + DE analysis (not case-sensitive). If any of such genes were provided in the --genes + parameter, they will be excluded from there as well. + 'sd:layout': + advanced: true + + normalization_method: + type: + - "null" + - type: enum + symbols: + - "vst" + - "rlog" + default: "rlog" + label: "Read counts normalization for the exploratory visualization part of the analysis" + doc: | + Read counts normalization for the exploratory visualization part of the analysis. + Use 'vst' for medium-to-large datasets (n > 30) and 'rlog' for small datasets + (n < 30), when there is a wide range of sequencing depth across samples. + Default: rlog + 'sd:layout': + advanced: true + + remove: + type: boolean? + default: false + label: "Remove batch effect when generating normalized read counts" + doc: | + Remove batch effect when generating normalized read counts for the exploratory + visualization part of the analysis. Ignored if --batchby is not provided. + Default: do not remove batch effect from normalized read counts. + 'sd:layout': + advanced: true + + center_row: + type: boolean? + default: false + label: "Apply mean centering for feature expression prior to running clustering by row" + doc: | + Apply mean centering for gene expression prior to running + clustering by row. Ignored if --cluster is set to column or + not provided. 
Default: do not center + 'sd:layout': + advanced: true + + cluster_method: + type: + - "null" + - type: enum + symbols: + - "row" + - "column" + - "both" + - "none" + default: "none" + label: "Hopach clustering method to be run on normalized read counts" + doc: | + Hopach clustering method to be run on normalized read counts for the + exploratory visualization part of the analysis. Default: do not run + clustering + 'sd:layout': + advanced: true + + row_distance: + type: + - "null" + - type: enum + symbols: + - "cosangle" + - "abscosangle" + - "euclid" + - "abseuclid" + - "cor" + - "abscor" + default: "cosangle" + label: "Distance metric for HOPACH row clustering" + doc: | + Distance metric for HOPACH row clustering. Ignored if --cluster is set + to column or not provided. Default: cosangle + 'sd:layout': + advanced: true + + column_distance: + type: + - "null" + - type: enum + symbols: + - "cosangle" + - "abscosangle" + - "euclid" + - "abseuclid" + - "cor" + - "abscor" + default: "euclid" + label: "Distance metric for HOPACH column clustering" + doc: | + Distance metric for HOPACH column clustering. Ignored if --cluster is set + to row or not provided. Default: euclid + 'sd:layout': + advanced: true + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + default: "classic" + label: "Color theme for all generated plots" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + 'sd:layout': + advanced: true + + parallel_memory_limit: + type: + - "null" + - type: enum + symbols: + - "32" + default: "32" + label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus.
+ Forced to 32 GB + 'sd:layout': + advanced: true + + vector_memory_limit: + type: + - "null" + - type: enum + symbols: + - "64" + default: "64" + label: "Maximum vector memory in GB allowed to be used by R" + doc: | + Maximum vector memory in GB allowed to be used by R. + Forced to 64 GB + 'sd:layout': + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + default: "1" + label: "Number of cores/cpus to use" + doc: | + Number of cores/cpus to use + Forced to 1 + 'sd:layout': + advanced: true + + +outputs: + + umap_rd_rnaumap_plot_png: + type: File? + outputSource: de_pseudobulk/umap_rd_rnaumap_plot_png + label: "Cells RNA UMAP split by selected biological condition" + doc: | + Cells UMAP split by selected biological condition, optionally + subsetted to the specific cluster or cell type (rnaumap dim. + reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Cells RNA UMAP split by selected biological condition' + + umap_rd_atacumap_plot_png: + type: File? + outputSource: de_pseudobulk/umap_rd_atacumap_plot_png + label: "Cells ATAC UMAP split by selected biological condition" + doc: | + Cells UMAP split by selected biological condition, optionally + subsetted to the specific cluster or cell type (atacumap dim. + reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Cells ATAC UMAP split by selected biological condition' + + umap_rd_wnnumap_plot_png: + type: File? + outputSource: de_pseudobulk/umap_rd_wnnumap_plot_png + label: "Cells WNN UMAP split by selected biological condition" + doc: | + Cells UMAP split by selected biological condition, optionally + subsetted to the specific cluster or cell type (wnnumap dim. + reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Cells WNN UMAP split by selected biological condition' + + mds_plot_html: + type: File? 
+ outputSource: de_pseudobulk/mds_plot_html + label: "MDS plot of normalized counts" + doc: | + MDS plot of normalized counts. Optionally batch corrected + if --remove was set to True. + HTML format + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + volcano_plot_html_file: + type: File + outputSource: make_volcano_plot/html_file + label: "Volcano Plot" + doc: | + HTML index file for Volcano Plot + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + volcano_plot_html_data: + type: Directory + outputSource: make_volcano_plot/html_data + label: "Directory html data for Volcano Plot" + doc: | + Directory html data for Volcano Plot + + ma_plot_html_file: + type: File + outputSource: make_ma_plot/html_file + label: "MA-plot" + doc: | + HTML index file for MA-plot + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + ma_plot_html_data: + type: Directory + outputSource: make_ma_plot/html_data + label: "Directory html data for MA-plot" + doc: | + Directory html data for MA-plot + + heatmap_html: + type: File + outputSource: morpheus_heatmap/heatmap_html + label: "Heatmap of normalized counts" + doc: | + Morpheus heatmap in HTML format + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + pca_1_2_plot_png: + type: File? + outputSource: de_pseudobulk/pca_1_2_plot_png + label: "Normalized counts PCA (PC1 and PC2)" + doc: | + Normalized counts PCA (PC1 and PC2) subsetted to all DE genes regardless + of Padj, optionally batch corrected by the selected criteria.
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Normalized counts PCA (PC2 and PC3)' + + dxpr_vlcn_plot_png: + type: File? + outputSource: de_pseudobulk/dxpr_vlcn_plot_png + label: "Volcano plot of differentially expressed genes" + doc: | + Volcano plot of differentially expressed genes. Highlighted genes are either + provided by user or top 10 genes with the highest log2FC values. The direction + of comparison is defined by --second vs --first groups of cells optionally + subsetted to the specific cluster or cell type and coerced to the pseudobulk + RNA-Seq samples. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Volcano plot of differentially expressed genes' + + xpr_dnst_plot_png: + type: + - "null" + - type: array + items: File + outputSource: de_pseudobulk/xpr_dnst_plot_png + label: "Log normalized gene expression density per dataset" + doc: | + Log normalized gene expression density per dataset optionally subsetted to the + specific cluster or cell type. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized gene expression density per dataset' + + xpr_per_cell_rd_rnaumap_plot_png: + type: + - "null" + - type: array + items: File + outputSource: de_pseudobulk/xpr_per_cell_rd_rnaumap_plot_png + label: "Log normalized gene expression on cells RNA UMAP per dataset" + doc: | + Log normalized gene expression on cells UMAP per dataset optionally subsetted + to the specific cluster or cell type (rnaumap dim. reduction).
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized gene expression on cells RNA UMAP per dataset' + + xpr_per_cell_rd_atacumap_plot_png: + type: + - "null" + - type: array + items: File + outputSource: de_pseudobulk/xpr_per_cell_rd_atacumap_plot_png + label: "Log normalized gene expression on cells ATAC UMAP per dataset" + doc: | + Log normalized gene expression on cells UMAP per dataset optionally subsetted + to the specific cluster or cell type (atacumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized gene expression on cells ATAC UMAP per dataset' + + xpr_per_cell_rd_wnnumap_plot_png: + type: + - "null" + - type: array + items: File + outputSource: de_pseudobulk/xpr_per_cell_rd_wnnumap_plot_png + label: "Log normalized gene expression on cells WNN UMAP per dataset" + doc: | + Log normalized gene expression on cells UMAP per dataset optionally subsetted + to the specific cluster or cell type (wnnumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized gene expression on cells WNN UMAP per dataset' + + xpr_htmp_plot_png: + type: File? + outputSource: de_pseudobulk/xpr_htmp_plot_png + label: "Log normalized gene expression heatmap per dataset" + doc: | + Normalized gene expression heatmap optionally subsetted + to the specific cluster or cell type. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Normalized gene expression heatmap' + + diff_expr_genes: + type: File + outputSource: de_pseudobulk/diff_expr_genes + label: "Differentially expressed genes" + doc: | + Differentially expressed genes. 
+ TSV format + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'Diff expressed genes' + Title: 'Differentially expressed genes' + + read_counts_file: + type: File + outputSource: de_pseudobulk/read_counts_gct + label: "GSEA compatible normalized counts" + doc: | + GSEA compatible normalized counts, optionally, batch corrected. + GCT format + + phenotypes_file: + type: File + outputSource: de_pseudobulk/phenotypes_cls + label: "GSEA compatible phenotypes file" + doc: | + GSEA compatible phenotypes file defined based on --splitby, --first, + and --second parameters. + CLS format + + de_pseudobulk_stdout_log: + type: File + outputSource: de_pseudobulk/stdout_log + label: "stdout log generated by de_pseudobulk step" + doc: | + stdout log generated by de_pseudobulk step + + de_pseudobulk_stderr_log: + type: File + outputSource: de_pseudobulk/stderr_log + label: "stderr log generated by de_pseudobulk step" + doc: | + stderr log generated by de_pseudobulk step + + morpheus_heatmap_stdout_log: + type: File + outputSource: morpheus_heatmap/stdout_log + label: "stdout log generated by morpheus_heatmap step" + doc: "stdout log generated by morpheus_heatmap step" + + morpheus_heatmap_stderr_log: + type: File + outputSource: morpheus_heatmap/stderr_log + label: "stderr log generated by morpheus_heatmap step" + doc: "stderr log generated by morpheus_heatmap step" + + +steps: + + de_pseudobulk: + run: ../tools/sc-rna-de-pseudobulk.cwl + in: + query_data_rds: query_data_rds + datasets_metadata: datasets_metadata + splitby: splitby + first_cond: first_cond + second_cond: second_cond + batchby: batchby + groupby: groupby + subset: + source: subset + valueFrom: $(split_features(self)) + lrt: lrt + maximum_padj: maximum_padj + genes_of_interest: + source: genes_of_interest + valueFrom: $(split_features(self)) + exclude_pattern: exclude_pattern + normalization_method: normalization_method + remove: remove + cluster_method: + source: cluster_method + valueFrom: $(self=="none"?null:self) 
+ row_distance: row_distance + column_distance: column_distance + center_row: center_row + verbose: + default: true + color_theme: color_theme + parallel_memory_limit: + source: parallel_memory_limit + valueFrom: $(parseInt(self)) + vector_memory_limit: + source: vector_memory_limit + valueFrom: $(parseInt(self)) + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - umap_rd_rnaumap_plot_png + - umap_rd_atacumap_plot_png + - umap_rd_wnnumap_plot_png + - mds_plot_html + - pca_1_2_plot_png + - pca_2_3_plot_png + - dxpr_vlcn_plot_png + - xpr_dnst_plot_png + - xpr_per_cell_rd_rnaumap_plot_png + - xpr_per_cell_rd_atacumap_plot_png + - xpr_per_cell_rd_wnnumap_plot_png + - xpr_htmp_plot_png + - diff_expr_genes + - read_counts_gct + - phenotypes_cls + - stdout_log + - stderr_log + + morpheus_heatmap: + run: ../tools/morpheus-heatmap.cwl + in: + read_counts_gct: de_pseudobulk/read_counts_gct + out: + - heatmap_html + - stdout_log + - stderr_log + + make_volcano_plot: + run: ../tools/volcano-plot.cwl + in: + diff_expr_file: de_pseudobulk/diff_expr_genes + x_axis_column: + default: "log2FoldChange" + y_axis_column: + default: "padj" + label_column: + default: "gene" + out: + - html_data + - html_file + + make_ma_plot: + run: ../tools/ma-plot.cwl + in: + diff_expr_file: de_pseudobulk/diff_expr_genes + x_axis_column: + default: "baseMean" + y_axis_column: + default: "log2FoldChange" + label_column: + default: "gene" + out: + - html_data + - html_file + + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-cell Pseudobulk Differential Expression Analysis Between Datasets" +s:name: "Single-cell Pseudobulk Differential Expression Analysis Between Datasets" +s:alternateName: "Identifies differentially expressed genes between groups of cells coerced to pseudobulk datasets" + +s:downloadUrl: 
https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-de-pseudobulk.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell Pseudobulk Differential Expression Analysis Between Datasets + + Identifies differentially expressed genes between groups of cells + coerced to pseudobulk datasets. 
\ No newline at end of file diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl new file mode 100644 index 00000000..be2e14cc --- /dev/null +++ b/workflows/sc-rna-filter.cwl @@ -0,0 +1,712 @@ +cwlVersion: v1.1 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: # split_numbers(line) turns a whitespace/comma separated string into an array of floats, skipping empty tokens, and returns null when no values are left + - var split_numbers = function(line) { + var splitted_line = line?line.split(/[\s,]+/).filter(Boolean).map(parseFloat):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + + +'sd:upstream': + sc_rnaseq_sample: + - "cellranger-aggr.cwl" + - "single-cell-preprocess-cellranger.cwl" + - "cellranger-multi.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name/alias" + sd:preview: + position: 1 + + filtered_feature_bc_matrix_folder: + type: File + label: "Cell Ranger Count/Aggregate Experiment" + doc: | + Path to the compressed folder with feature-barcode matrix from Cell Ranger Count/Aggregate + experiment in MEX format. + 'sd:upstreamSource': "sc_rnaseq_sample/filtered_feature_bc_matrix_folder" + 'sd:localLabel': true + + aggregation_metadata: + type: File? + label: "Cell Ranger Count/Aggregate Experiment" + doc: | + Path to the metadata TSV/CSV file to set the datasets identities. If '--mex' points to + the Cell Ranger Aggregate outputs, the aggregation.csv file can be used. If input is not + provided, the default dummy_metadata.csv will be used instead. + 'sd:upstreamSource': "sc_rnaseq_sample/aggregation_metadata" + 'sd:localLabel': true + + grouping_data: + type: File? + label: "Optional TSV/CSV file to define datasets grouping with 'library_id' and 'condition' columns. Rows order should correspond to the aggregation metadata." + doc: | + Path to the TSV/CSV file to define datasets grouping.
+ First column - 'library_id' with the values and order + that correspond to the 'library_id' column from the + '--identity' file, second column 'condition'. + Default: each dataset is assigned to its own group. + + barcodes_data: + type: File? + label: "Optional TSV/CSV file to prefilter and extend metadata by barcodes. First column should be named as 'barcode'" + doc: | + Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata overwriting the existing ones if + those are present. + Default: all cells used, no extra metadata is added + + minimum_genes: + type: string? + default: "250" + label: "Include cells where at least this many genes are detected" + doc: | + Include cells where at least this many genes are detected. If multiple values + provided, each of them will be applied to the correspondent dataset from the + '--mex' input based on the '--identity' file. + Default: 250 (applied to all datasets) + 'sd:layout': + advanced: true + + maximum_genes: + type: string? + default: "5000" + label: "Include cells with the number of genes not bigger than this value" + doc: | + Include cells with the number of genes not bigger than this value. If multiple + values provided, each of them will be applied to the correspondent dataset from + the '--mex' input based on the '--identity' file. + Default: 5000 (applied to all datasets) + 'sd:layout': + advanced: true + + rna_minimum_umi: + type: string? + default: "500" + label: "Include cells where at least this many UMI (transcripts) are detected" + doc: | + Include cells where at least this many UMI (transcripts) are detected. + If multiple values provided, each of them will be applied to the correspondent + dataset from the '--mex' input based on the '--identity' file.
+ Default: 500 (applied to all datasets) + 'sd:layout': + advanced: true + + minimum_novelty_score: + type: string? + default: "0.8" + label: "Include cells with the novelty score not lower than this value, calculated as log10(genes)/log10(UMI)" + doc: | + Include cells with the novelty score not lower than this value, calculated + as log10(genes)/log10(UMI). If multiple values provided, each of them will + be applied to the correspondent dataset from the '--mex' input based on the + '--identity' file. + Default: 0.8 (applied to all datasets) + 'sd:layout': + advanced: true + + mito_pattern: + type: string? + default: "^mt-|^MT-" + label: "Regex pattern to identify mitochondrial genes" + doc: | + Regex pattern to identify mitochondrial genes. + Default: '^mt-|^MT-' + 'sd:layout': + advanced: true + + maximum_mito_perc: + type: float? + default: 5 + label: "Include cells with the percentage of transcripts mapped to mitochondrial genes not bigger than this value" + doc: | + Include cells with the percentage of transcripts mapped to mitochondrial + genes not bigger than this value. + Default: 5 (applied to all datasets) + 'sd:layout': + advanced: true + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + default: "classic" + label: "Color theme for all generated plots" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + 'sd:layout': + advanced: true + + parallel_memory_limit: + type: + - "null" + - type: enum + symbols: + - "32" + default: "32" + label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. 
+ Forced to 32 GB + 'sd:layout': + advanced: true + + vector_memory_limit: + type: + - "null" + - type: enum + symbols: + - "32" + default: "32" + label: "Maximum vector memory in GB allowed to be used by R" + doc: | + Maximum vector memory in GB allowed to be used by R. + Forced to 32 GB + 'sd:layout': + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + default: "1" + label: "Number of cores/cpus to use" + doc: | + Number of cores/cpus to use + Forced to 1 + 'sd:layout': + advanced: true + + +outputs: + + raw_1_2_qc_mtrcs_pca_plot_png: + type: File? + outputSource: sc_rna_filter/raw_1_2_qc_mtrcs_pca_plot_png + label: "PC1 and PC2 from the QC metrics PCA (not filtered)" + doc: | + PC1 and PC2 from the QC metrics PCA (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'PC1 and PC2 from the QC metrics PCA' + + raw_2_3_qc_mtrcs_pca_plot_png: + type: File? + outputSource: sc_rna_filter/raw_2_3_qc_mtrcs_pca_plot_png + label: "PC2 and PC3 from the QC metrics PCA (not filtered)" + doc: | + PC2 and PC3 from the QC metrics PCA (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'PC2 and PC3 from the QC metrics PCA' + + raw_cells_count_plot_png: + type: File? + outputSource: sc_rna_filter/raw_cells_count_plot_png + label: "Number of cells per dataset (not filtered)" + doc: | + Number of cells per dataset (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Number of cells per dataset' + + raw_umi_dnst_plot_png: + type: File? + outputSource: sc_rna_filter/raw_umi_dnst_plot_png + label: "UMI per cell density (not filtered)" + doc: | + UMI per cell density (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'UMI per cell density' + + raw_gene_dnst_plot_png: + type: File? 
+ outputSource: sc_rna_filter/raw_gene_dnst_plot_png + label: "Genes per cell density (not filtered)" + doc: | + Genes per cell density (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Genes per cell density' + + raw_gene_umi_corr_plot_png: + type: File? + outputSource: sc_rna_filter/raw_gene_umi_corr_plot_png + label: "Genes vs UMI per cell correlation (not filtered)" + doc: | + Genes vs UMI per cell correlation (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Genes vs UMI per cell correlation' + + raw_mito_dnst_plot_png: + type: File? + outputSource: sc_rna_filter/raw_mito_dnst_plot_png + label: "Percentage of transcripts mapped to mitochondrial genes per cell density (not filtered)" + doc: | + Percentage of transcripts mapped to mitochondrial genes per cell density (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Percentage of transcripts mapped to mitochondrial genes per cell density' + + raw_nvlt_dnst_plot_png: + type: File? + outputSource: sc_rna_filter/raw_nvlt_dnst_plot_png + label: "Novelty score per cell density (not filtered)" + doc: | + Novelty score per cell density (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Novelty score per cell density' + + raw_qc_mtrcs_dnst_plot_png: + type: File? + outputSource: sc_rna_filter/raw_qc_mtrcs_dnst_plot_png + label: "QC metrics per cell density (not filtered)" + doc: | + QC metrics per cell density (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'QC metrics per cell density' + + raw_umi_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_rna_filter/raw_umi_dnst_spl_cnd_plot_png + label: "Split by grouping condition UMI per cell density (not filtered)" + doc: | + Split by grouping condition UMI per cell density (not filtered). 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Split by grouping condition UMI per cell density' + + raw_gene_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_rna_filter/raw_gene_dnst_spl_cnd_plot_png + label: "Split by grouping condition genes per cell density (not filtered)" + doc: | + Split by grouping condition genes per cell density (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Split by grouping condition genes per cell density' + + raw_mito_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_rna_filter/raw_mito_dnst_spl_cnd_plot_png + label: "Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density (not filtered)" + doc: | + Split by grouping condition the percentage of transcripts mapped + to mitochondrial genes per cell density (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density' + + raw_nvlt_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_rna_filter/raw_nvlt_dnst_spl_cnd_plot_png + label: "Split by grouping condition the novelty score per cell density (not filtered)" + doc: | + Split by grouping condition the novelty score per cell density (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Split by grouping condition the novelty score per cell density' + + fltr_1_2_qc_mtrcs_pca_plot_png: + type: File? + outputSource: sc_rna_filter/fltr_1_2_qc_mtrcs_pca_plot_png + label: "PC1 and PC2 from the QC metrics PCA (filtered)" + doc: | + PC1 and PC2 from the QC metrics PCA (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'PC1 and PC2 from the QC metrics PCA' + + fltr_2_3_qc_mtrcs_pca_plot_png: + type: File? 
+ outputSource: sc_rna_filter/fltr_2_3_qc_mtrcs_pca_plot_png + label: "PC2 and PC3 from the QC metrics PCA (filtered)" + doc: | + PC2 and PC3 from the QC metrics PCA (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'PC2 and PC3 from the QC metrics PCA' + + fltr_cells_count_plot_png: + type: File? + outputSource: sc_rna_filter/fltr_cells_count_plot_png + label: "Number of cells per dataset (filtered)" + doc: | + Number of cells per dataset (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Number of cells per dataset' + + fltr_umi_dnst_plot_png: + type: File? + outputSource: sc_rna_filter/fltr_umi_dnst_plot_png + label: "UMI per cell density (filtered)" + doc: | + UMI per cell density (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'UMI per cell density' + + fltr_gene_dnst_plot_png: + type: File? + outputSource: sc_rna_filter/fltr_gene_dnst_plot_png + label: "Genes per cell density (filtered)" + doc: | + Genes per cell density (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Genes per cell density' + + fltr_gene_umi_corr_plot_png: + type: File? + outputSource: sc_rna_filter/fltr_gene_umi_corr_plot_png + label: "Genes vs UMI per cell correlation (filtered)" + doc: | + Genes vs UMI per cell correlation (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Genes vs UMI per cell correlation' + + fltr_mito_dnst_plot_png: + type: File? + outputSource: sc_rna_filter/fltr_mito_dnst_plot_png + label: "Percentage of transcripts mapped to mitochondrial genes per cell density (filtered)" + doc: | + Percentage of transcripts mapped to mitochondrial genes per cell density (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Percentage of transcripts mapped to mitochondrial genes per cell density' + + fltr_nvlt_dnst_plot_png: + type: File? 
+ outputSource: sc_rna_filter/fltr_nvlt_dnst_plot_png + label: "Novelty score per cell density (filtered)" + doc: | + Novelty score per cell density (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Novelty score per cell density' + + fltr_qc_mtrcs_dnst_plot_png: + type: File? + outputSource: sc_rna_filter/fltr_qc_mtrcs_dnst_plot_png + label: "QC metrics per cell density (filtered)" + doc: | + QC metrics per cell density (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'QC metrics per cell density' + + fltr_umi_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_rna_filter/fltr_umi_dnst_spl_cnd_plot_png + label: "Split by grouping condition UMI per cell density (filtered)" + doc: | + Split by grouping condition UMI per cell density (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Split by grouping condition UMI per cell density' + + fltr_gene_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_rna_filter/fltr_gene_dnst_spl_cnd_plot_png + label: "Split by grouping condition genes per cell density (filtered)" + doc: | + Split by grouping condition genes per cell density (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Split by grouping condition genes per cell density' + + fltr_mito_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_rna_filter/fltr_mito_dnst_spl_cnd_plot_png + label: "Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density (filtered)" + doc: | + Split by grouping condition the percentage of transcripts mapped + to mitochondrial genes per cell density (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density' + + fltr_nvlt_dnst_spl_cnd_plot_png: + type: File? 
+ outputSource: sc_rna_filter/fltr_nvlt_dnst_spl_cnd_plot_png + label: "Split by grouping condition the novelty score per cell density (filtered)" + doc: | + Split by grouping condition the novelty score per cell density (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Split by grouping condition the novelty score per cell density' + + ucsc_cb_config_data: + type: File + outputSource: compress_cellbrowser_config_data/compressed_folder + label: "Compressed directory with UCSC Cellbrowser configuration data" + doc: | + Compressed directory with UCSC Cellbrowser configuration data. + + ucsc_cb_html_data: + type: Directory + outputSource: sc_rna_filter/ucsc_cb_html_data + label: "Directory with UCSC Cellbrowser html data" + doc: | + Directory with UCSC Cellbrowser html data. + + ucsc_cb_html_file: + type: File + outputSource: sc_rna_filter/ucsc_cb_html_file + label: "Open in UCSC Cell Browser" + doc: | + HTML index file from the directory with UCSC Cellbrowser html data. 
+ 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + seurat_data_rds: + type: File + outputSource: sc_rna_filter/seurat_data_rds + label: "Processed Seurat data in RDS format" + doc: | + Processed Seurat data in RDS format + + sc_rna_filter_stdout_log: + type: File + outputSource: sc_rna_filter/stdout_log + label: "stdout log generated by sc_rna_filter step" + doc: | + stdout log generated by sc_rna_filter step + + sc_rna_filter_stderr_log: + type: File + outputSource: sc_rna_filter/stderr_log + label: "stderr log generated by sc_rna_filter step" + doc: | + stderr log generated by sc_rna_filter step + + +steps: + + uncompress_feature_bc_matrices: + doc: | + Extracts the content of TAR file into a folder + run: ../tools/tar-extract.cwl + in: + file_to_extract: filtered_feature_bc_matrix_folder + out: + - extracted_folder + + sc_rna_filter: + doc: | + Filters single-cell RNA-Seq datasets based on the common QC metrics + run: ../tools/sc-rna-filter.cwl + in: + feature_bc_matrices_folder: uncompress_feature_bc_matrices/extracted_folder + aggregation_metadata: aggregation_metadata + grouping_data: grouping_data + barcodes_data: barcodes_data + rna_minimum_cells: + default: 1 + minimum_genes: + source: minimum_genes + valueFrom: $(split_numbers(self)) + maximum_genes: + source: maximum_genes + valueFrom: $(split_numbers(self)) + rna_minimum_umi: + source: rna_minimum_umi + valueFrom: $(split_numbers(self)) + minimum_novelty_score: + source: minimum_novelty_score + valueFrom: $(split_numbers(self)) + mito_pattern: mito_pattern + maximum_mito_perc: maximum_mito_perc + verbose: + default: true + export_ucsc_cb: + default: true + color_theme: color_theme + parallel_memory_limit: + source: parallel_memory_limit + valueFrom: $(parseInt(self)) + vector_memory_limit: + source: vector_memory_limit + valueFrom: $(parseInt(self)) + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - raw_1_2_qc_mtrcs_pca_plot_png + - 
raw_2_3_qc_mtrcs_pca_plot_png + - raw_cells_count_plot_png + - raw_umi_dnst_plot_png + - raw_gene_dnst_plot_png + - raw_gene_umi_corr_plot_png + - raw_mito_dnst_plot_png + - raw_nvlt_dnst_plot_png + - raw_qc_mtrcs_dnst_plot_png + - raw_umi_dnst_spl_cnd_plot_png + - raw_gene_dnst_spl_cnd_plot_png + - raw_mito_dnst_spl_cnd_plot_png + - raw_nvlt_dnst_spl_cnd_plot_png + - fltr_1_2_qc_mtrcs_pca_plot_png + - fltr_2_3_qc_mtrcs_pca_plot_png + - fltr_cells_count_plot_png + - fltr_umi_dnst_plot_png + - fltr_gene_dnst_plot_png + - fltr_gene_umi_corr_plot_png + - fltr_mito_dnst_plot_png + - fltr_nvlt_dnst_plot_png + - fltr_qc_mtrcs_dnst_plot_png + - fltr_umi_dnst_spl_cnd_plot_png + - fltr_gene_dnst_spl_cnd_plot_png + - fltr_mito_dnst_spl_cnd_plot_png + - fltr_nvlt_dnst_spl_cnd_plot_png + - ucsc_cb_config_data + - ucsc_cb_html_data + - ucsc_cb_html_file + - seurat_data_rds + - stdout_log + - stderr_log + + compress_cellbrowser_config_data: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: sc_rna_filter/ucsc_cb_config_data + out: + - compressed_folder + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-cell RNA-Seq Filtering Analysis" +s:name: "Single-cell RNA-Seq Filtering Analysis" +s:alternateName: "Filters single-cell RNA-Seq datasets based on the common QC metrics" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-filter.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: 
"45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell RNA-Seq Filtering Analysis + + Filters single-cell RNA-Seq datasets based on the common QC metrics. \ No newline at end of file diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl new file mode 100644 index 00000000..2b5ce3c1 --- /dev/null +++ b/workflows/sc-rna-reduce.cwl @@ -0,0 +1,674 @@ +cwlVersion: v1.1 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var split_features = function(line) { + function get_unique(value, index, self) { + return self.indexOf(value) === index && value != ""; + } + let splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + + +'sd:upstream': + sc_tools_sample: + - "sc-atac-cluster.cwl" + - "sc-atac-reduce.cwl" + - "sc-rna-filter.cwl" + - "sc-multiome-filter.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name/alias" + sd:preview: + position: 1 + + query_data_rds: + type: File + label: "Experiment run through either Single-cell RNA-Seq or Multiome ATAC and RNA-Seq Filtering Analysis" + doc: | + Path to the RDS file to load Seurat object from. This file should include genes + expression information stored in the RNA assay. 
+ 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" + 'sd:localLabel': true + + datasets_metadata: + type: File? + label: "Path to the TSV/CSV file to optionally extend Seurat object metadata with categorical values" + doc: | + Path to the TSV/CSV file to optionally extend Seurat object metadata with + categorical values using samples identities. First column - 'library_id' + should correspond to all unique values from the 'new.ident' column of the + loaded Seurat object. If any of the columns provided in this file are already + present in the Seurat object metadata, they will be overwritten. When combined + with --barcodes parameter, first the metadata will be extended, then barcode + filtering will be applied. + Default: no extra metadata is added + + barcodes_data: + type: File? + label: "Optional TSV/CSV file to prefilter and extend metadata by barcodes. First column should be named as 'barcode'" + doc: | + Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata overwriting the existing ones if + those are present. + Default: all cells used, no extra metadata is added + + cell_cycle_data: + type: File? + label: "Optional TSV/CSV file with cell cycle data. First column - 'phase', second column 'gene_id'" + doc: | + Path to the TSV/CSV file with the information for cell cycle score assignment. + First column - 'phase', second column 'gene_id'. If loaded Seurat object already + includes cell cycle scores in 'S.Score', 'G2M.Score', and 'CC.Difference' metadata + columns they will be overwritten. + Default: skip cell cycle score assignment. + + dimensions: + type: int? + label: "Dimensionality to use in UMAP projection (from 1 to 50)" + default: 40 + doc: | + Dimensionality to use in UMAP projection (from 1 to 50). If single value N + is provided, use from 1 to N PCs.
If multiple values are provided, subset to + only selected PCs. In combination with --ntgr set to harmony, selected principal + components will be used in Harmony integration. + Default: from 1 to 40 + + normalization_method: + type: + - "null" + - type: enum + symbols: + - "sct" + - "log" + - "sctglm" + label: "Normalization method applied to genes expression counts" + default: "sctglm" + doc: | + Normalization method applied to genes expression counts. If loaded Seurat object + includes multiple datasets, normalization will be run independently for each of + them, unless integration is disabled with 'none' or set to 'harmony' + Default: sctglm + 'sd:layout': + advanced: true + + integration_method: + type: + - "null" + - type: enum + symbols: + - "seurat" + - "harmony" + - "none" + label: "Integration method used for joint analysis of multiple datasets" + default: "seurat" + doc: | + Integration method used for joint analysis of multiple datasets. Automatically + set to 'none' if loaded Seurat object includes only one dataset. + Default: seurat + 'sd:layout': + advanced: true + + integrate_by: + type: string? + label: "Variable(s) to be integrated out when running multiple integration with Harmony" + default: "new.ident" + doc: | + Column(s) from the Seurat object metadata to define the variable(s) that should + be integrated out when running multiple datasets integration with harmony. May + include columns from the extra metadata added with --metadata parameter. Ignored + if --ntgr is not set to harmony. + Default: new.ident + 'sd:layout': + advanced: true + + highly_var_genes_count: + type: int? + label: "Number of highly variable genes used in datasets integration, scaling and dimensionality reduction" + default: 3000 + doc: | + Number of highly variable genes used in datasets integration, scaling and + dimensionality reduction. + Default: 3000 + 'sd:layout': + advanced: true + + regress_mito_perc: + type: boolean?
+ label: "Regress the percentage of transcripts mapped to mitochondrial genes as a confounding source of variation" + default: false + doc: | + Regress the percentage of transcripts mapped to mitochondrial genes as a + confounding source of variation. + Default: false + 'sd:layout': + advanced: true + + regress_genes: + type: string? + label: "Regress genes per cell counts as a confounding source of variation" + default: null + doc: | + Genes whose expression should be regressed as a confounding source of variation. + Default: None + 'sd:layout': + advanced: true + + regress_cellcycle: + type: + - "null" + - type: enum + symbols: + - "completely" + - "partialy" + - "none" + label: "Regress cell cycle scores as a confounding source of variation" + default: "none" + doc: | + "completely" - regress all signals associated with cell cycle phase. + "partialy" - regress only differences in cell cycle phase among + proliferating cells, signals separating non-cycling and cycling cells + will be maintained. + "none" - do not regress signals associated with cell cycle phase + Default: "none" + 'sd:layout': + advanced: true + + umap_spread: + type: float? + label: "UMAP Spread - the effective scale of embedded points (determines how clustered/clumped the embedded points are)" + default: 1 + doc: | + The effective scale of embedded points on UMAP. In combination with '--mindist' + it determines how clustered/clumped the embedded points are. + Default: 1 + 'sd:layout': + advanced: true + + umap_mindist: + type: float? + label: "UMAP Min. Dist. - controls how tightly the embedding is allowed to compress points together" + default: 0.3 + doc: | + Controls how tightly the embedding is allowed to compress points together on UMAP. + Larger values ensure embedded points are more evenly distributed, while smaller + values allow the algorithm to optimise more accurately with regard to local structure. + Sensible values are in the range 0.001 to 0.5.
+ Default: 0.3 + 'sd:layout': + advanced: true + + umap_neighbors: + type: int? + label: "UMAP Neighbors Number - determines the number of neighboring points used" + default: 30 + doc: | + Determines the number of neighboring points used in UMAP. Larger values will result + in more global structure being preserved at the loss of detailed local structure. + In general this parameter should often be in the range 5 to 50. + Default: 30 + 'sd:layout': + advanced: true + + umap_metric: + type: + - "null" + - type: enum + symbols: + - "euclidean" + - "cosine" + - "correlation" + label: "UMAP Dist. Metric - the metric to use to compute distances in high dimensional space" + default: "cosine" + doc: | + The metric to use to compute distances in high dimensional space for UMAP. + Default: cosine + 'sd:layout': + advanced: true + + umap_method: + type: + - "null" + - type: enum + symbols: + - "uwot" + - "uwot-learn" + - "umap-learn" + label: "UMAP implementation to run (if set to 'umap-learn' use 'correlation' distance metric)" + default: "uwot" + doc: | + UMAP implementation to run. If set to 'umap-learn' use --umetric 'correlation' + Default: uwot + 'sd:layout': + advanced: true + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + default: "classic" + label: "Color theme for all generated plots" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + 'sd:layout': + advanced: true + + parallel_memory_limit: + type: + - "null" + - type: enum + symbols: + - "32" + default: "32" + label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. 
+ Forced to 32 GB + 'sd:layout': + advanced: true + + vector_memory_limit: + type: + - "null" + - type: enum + symbols: + - "96" + default: "96" + label: "Maximum vector memory in GB allowed to be used by R" + doc: | + Maximum vector memory in GB allowed to be used by R. + Forced to 96 GB + 'sd:layout': + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + default: "1" + label: "Number of cores/cpus to use" + doc: | + Number of cores/cpus to use + Forced to 1 + 'sd:layout': + advanced: true + + +outputs: + + elbow_plot_png: + type: File? + outputSource: sc_rna_reduce/elbow_plot_png + label: "Elbow plot (from cells PCA)" + doc: | + Elbow plot (from cells PCA). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Elbow plot (from cells PCA)' + + qc_dim_corr_plot_png: + type: File? + outputSource: sc_rna_reduce/qc_dim_corr_plot_png + label: "Correlation plots between QC metrics and cells PCA components" + doc: | + Correlation plots between QC metrics and cells PCA components. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Correlation plots between QC metrics and cells PCA components' + + umap_qc_mtrcs_plot_png: + type: File? + outputSource: sc_rna_reduce/umap_qc_mtrcs_plot_png + label: "QC metrics on cells UMAP" + doc: | + QC metrics on cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'QC metrics on cells UMAP' + + umap_plot_png: + type: File? + outputSource: sc_rna_reduce/umap_plot_png + label: "Cells UMAP" + doc: | + Cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Cells UMAP' + + ccpca_plot_png: + type: File? + outputSource: sc_rna_reduce/ccpca_plot_png + label: "Cells PCA using only cell cycle genes" + doc: | + Cells PCA using only cell cycle genes. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Cells PCA using only cell cycle genes' + + umap_spl_ph_plot_png: + type: File? 
+ outputSource: sc_rna_reduce/umap_spl_ph_plot_png + label: "Split by cell cycle phase cells UMAP" + doc: | + Split by cell cycle phase cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by cell cycle phase cells UMAP' + + umap_spl_mito_plot_png: + type: File? + outputSource: sc_rna_reduce/umap_spl_mito_plot_png + label: "Split by the percentage of transcripts mapped to mitochondrial genes cells UMAP" + doc: | + Split by the percentage of transcripts mapped to mitochondrial genes cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by the percentage of transcripts mapped to mitochondrial genes cells UMAP' + + umap_spl_umi_plot_png: + type: File? + outputSource: sc_rna_reduce/umap_spl_umi_plot_png + label: "Split by the UMI per cell counts cells UMAP" + doc: | + Split by the UMI per cell counts cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by the UMI per cell counts cells UMAP' + + umap_spl_gene_plot_png: + type: File? + outputSource: sc_rna_reduce/umap_spl_gene_plot_png + label: "Split by the genes per cell counts cells UMAP" + doc: | + Split by the genes per cell counts cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by the genes per cell counts cells UMAP' + + umap_spl_idnt_plot_png: + type: File? + outputSource: sc_rna_reduce/umap_spl_idnt_plot_png + label: "Split by dataset cells UMAP" + doc: | + Split by dataset cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by dataset cells UMAP' + + ccpca_spl_idnt_plot_png: + type: File? + outputSource: sc_rna_reduce/ccpca_spl_idnt_plot_png + label: "Split by dataset cells PCA using only cell cycle genes" + doc: | + Split by dataset cells PCA using only cell cycle genes. 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by dataset cells PCA using only cell cycle genes' + + umap_spl_cnd_plot_png: + type: File? + outputSource: sc_rna_reduce/umap_spl_cnd_plot_png + label: "Split by grouping condition cells UMAP" + doc: | + Split by grouping condition cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Split by grouping condition cells UMAP' + + umap_gr_cnd_spl_ph_plot_png: + type: File? + outputSource: sc_rna_reduce/umap_gr_cnd_spl_ph_plot_png + label: "Grouped by condition split by cell cycle cells UMAP" + doc: | + Grouped by condition split by cell cycle cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Grouped by condition split by cell cycle cells UMAP' + + ccpca_spl_cnd_plot_png: + type: File? + outputSource: sc_rna_reduce/ccpca_spl_cnd_plot_png + label: "Split by grouping condition cells PCA using only cell cycle genes" + doc: | + Split by grouping condition cells PCA using only cell cycle genes. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Split by grouping condition cells PCA using only cell cycle genes' + + umap_gr_cnd_spl_mito_plot_png: + type: File? + outputSource: sc_rna_reduce/umap_gr_cnd_spl_mito_plot_png + label: "Grouped by condition split by the percentage of transcripts mapped to mitochondrial genes cells UMAP" + doc: | + Grouped by condition split by the percentage of transcripts mapped to mitochondrial genes cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Grouped by condition split by the percentage of transcripts mapped to mitochondrial genes cells UMAP' + + umap_gr_cnd_spl_umi_plot_png: + type: File? + outputSource: sc_rna_reduce/umap_gr_cnd_spl_umi_plot_png + label: "Grouped by condition split by the UMI per cell counts cells UMAP" + doc: | + Grouped by condition split by the UMI per cell counts cells UMAP. 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Grouped by condition split by the UMI per cell counts cells UMAP' + + umap_gr_cnd_spl_gene_plot_png: + type: File? + outputSource: sc_rna_reduce/umap_gr_cnd_spl_gene_plot_png + label: "Grouped by condition split by the genes per cell counts cells UMAP" + doc: | + Grouped by condition split by the genes per cell counts cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Grouped by condition split by the genes per cell counts cells UMAP' + + seurat_data_rds: + type: File + outputSource: sc_rna_reduce/seurat_data_rds + label: "Processed Seurat data in RDS format" + doc: | + Processed Seurat data in RDS format + + sc_rna_reduce_stdout_log: + type: File + outputSource: sc_rna_reduce/stdout_log + label: "stdout log generated by sc_rna_reduce step" + doc: | + stdout log generated by sc_rna_reduce step + + sc_rna_reduce_stderr_log: + type: File + outputSource: sc_rna_reduce/stderr_log + label: "stderr log generated by sc_rna_reduce step" + doc: | + stderr log generated by sc_rna_reduce step + + +steps: + + sc_rna_reduce: + doc: | + Integrates multiple single-cell RNA-Seq datasets, + reduces dimensionality using PCA + run: ../tools/sc-rna-reduce.cwl + in: + query_data_rds: query_data_rds + barcodes_data: barcodes_data + cell_cycle_data: cell_cycle_data + datasets_metadata: datasets_metadata + normalization_method: normalization_method + integration_method: integration_method + integrate_by: + source: integrate_by + valueFrom: $(split_features(self)) + highly_var_genes_count: highly_var_genes_count + regress_mito_perc: regress_mito_perc + regress_genes: + source: regress_genes + valueFrom: $(split_features(self)) + regress_ccycle_full: + source: regress_cellcycle + valueFrom: $(self=="completely"?true:null) + regress_ccycle_diff: + source: regress_cellcycle + valueFrom: $(self=="partialy"?true:null) + dimensions: dimensions + umap_spread: umap_spread + 
umap_mindist: umap_mindist + umap_neighbors: umap_neighbors + umap_metric: umap_metric + umap_method: umap_method + verbose: + default: true + export_ucsc_cb: + default: false + low_memory: + default: true + color_theme: color_theme + parallel_memory_limit: + source: parallel_memory_limit + valueFrom: $(parseInt(self)) + vector_memory_limit: + source: vector_memory_limit + valueFrom: $(parseInt(self)) + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - elbow_plot_png + - qc_dim_corr_plot_png + - umap_qc_mtrcs_plot_png + - umap_plot_png + - umap_spl_ph_plot_png + - ccpca_plot_png + - umap_spl_mito_plot_png + - umap_spl_umi_plot_png + - umap_spl_gene_plot_png + - umap_spl_idnt_plot_png + - ccpca_spl_idnt_plot_png + - umap_spl_cnd_plot_png + - umap_gr_cnd_spl_ph_plot_png + - ccpca_spl_cnd_plot_png + - umap_gr_cnd_spl_mito_plot_png + - umap_gr_cnd_spl_umi_plot_png + - umap_gr_cnd_spl_gene_plot_png + - seurat_data_rds + - stdout_log + - stderr_log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-cell RNA-Seq Dimensionality Reduction Analysis" +s:name: "Single-cell RNA-Seq Dimensionality Reduction Analysis" +s:alternateName: "Integrates multiple single-cell RNA-Seq datasets, reduces dimensionality using PCA" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-reduce.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + 
s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell RNA-Seq Dimensionality Reduction Analysis + + Integrates multiple single-cell RNA-Seq datasets, reduces dimensionality using PCA. \ No newline at end of file diff --git a/workflows/sc-triangulate.cwl b/workflows/sc-triangulate.cwl new file mode 100644 index 00000000..21b93b73 --- /dev/null +++ b/workflows/sc-triangulate.cwl @@ -0,0 +1,403 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var split_numbers = function(line) { + let splitted_line = line?line.split(/[\s,]+/).map(parseFloat):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + - var get_query_column = function(reduction, resolution) { + if (reduction=="RNA") { + return Array.from(split_numbers(resolution), r => "rna_res." + r); + } else if (reduction=="ATAC") { + return Array.from(split_numbers(resolution), r => "atac_res." + r); + } else if (reduction=="WNN") { + return Array.from(split_numbers(resolution), r => "wsnn_res." 
+ r); + } + }; + + +'sd:upstream': + sc_tools_sample: + - "sc-rna-cluster.cwl" + - "sc-atac-cluster.cwl" + - "sc-wnn-cluster.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name/alias" + sd:preview: + position: 1 + + query_data_rds: + type: File + label: "Experiment run through any of the Single-cell Cluster Analysis" + doc: | + Path to the RDS file to load Seurat object from. This file should include + genes expression and/or chromatin accessibility information stored in the RNA + and/or ATAC assays correspondingly. Additionally, 'rnaumap', and/or 'atacumap', + and/or 'wnnumap' dimensionality reductions should be present. + 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" + 'sd:localLabel': true + + query_reduction: + type: + - "null" + - type: enum + symbols: + - "RNA" + - "ATAC" + - "WNN" + default: "RNA" + label: "Select clusters based on" + doc: | + If set to 'RNA' selects query_source_column with 'rna_res' prefix. + If set to 'ATAC' selects query_source_column with 'atac_res' prefix. + If set to 'WNN' selects query_source_column with 'wsnn_res' prefix. + + query_resolution: + type: string + label: "Comma or space separated list of clustering resolutions to harmonize" + doc: | + Defines the suffix used when constructing values for 'query_source_column' + + barcodes_data: + type: File? + label: "Optional TSV/CSV file to prefilter and extend metadata by barcodes. First column should be named as 'barcode'" + doc: | + Path to the TSV/CSV file to optionally prefilter and extend Seurat object + metadata by selected barcodes. First column should be named as 'barcode'. + If file includes any other columns they will be added to the Seurat object + metadata overwriting the existing ones if those are present.
+ Default: all cells used, no extra metadata is added + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + default: "classic" + label: "Color theme for all generated plots" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + 'sd:layout': + advanced: true + + parallel_memory_limit: + type: + - "null" + - type: enum + symbols: + - "32" + default: "32" + label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Forced to 32 GB + 'sd:layout': + advanced: true + + vector_memory_limit: + type: + - "null" + - type: enum + symbols: + - "64" + default: "64" + label: "Maximum vector memory in GB allowed to be used by R" + doc: | + Maximum vector memory in GB allowed to be used by R. + Forced to 64 GB + 'sd:layout': + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + default: "1" + label: "Number of cores/cpus to use" + doc: | + Number of cores/cpus to use + Forced to 1 + 'sd:layout': + advanced: true + + +outputs: + + umap_tril_rd_rnaumap_plot_png: + type: File? + outputSource: triangulate/umap_tril_rd_rnaumap_plot_png + label: "Cells UMAP with integrated labels (rnaumap dim. reduction)" + doc: | + Cells UMAP with integrated labels (rnaumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'RNA' + Caption: 'Cells UMAP with integrated labels' + + umap_tric_rd_rnaumap_plot_png: + type: File? + outputSource: triangulate/umap_tric_rd_rnaumap_plot_png + label: "Cells UMAP with integration confidence scores (rnaumap dim. reduction)" + doc: | + Cells UMAP with integration confidence scores (rnaumap dim. reduction). 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'RNA' + Caption: 'Cells UMAP with integration confidence scores' + + umap_tria_rd_rnaumap_plot_png: + type: File? + outputSource: triangulate/umap_tria_rd_rnaumap_plot_png + label: "Cells UMAP with winning annotations (rnaumap dim. reduction)" + doc: | + Cells UMAP with winning annotations (rnaumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'RNA' + Caption: 'Cells UMAP with winning annotations' + + umap_tril_rd_atacumap_plot_png: + type: File? + outputSource: triangulate/umap_tril_rd_atacumap_plot_png + label: "Cells UMAP with integrated labels (atacumap dim. reduction)" + doc: | + Cells UMAP with integrated labels (atacumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'ATAC' + Caption: 'Cells UMAP with integrated labels' + + umap_tric_rd_atacumap_plot_png: + type: File? + outputSource: triangulate/umap_tric_rd_atacumap_plot_png + label: "Cells UMAP with integration confidence scores (atacumap dim. reduction)" + doc: | + Cells UMAP with integration confidence scores (atacumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'ATAC' + Caption: 'Cells UMAP with integration confidence scores' + + umap_tria_rd_atacumap_plot_png: + type: File? + outputSource: triangulate/umap_tria_rd_atacumap_plot_png + label: "Cells UMAP with winning annotations (atacumap dim. reduction)" + doc: | + Cells UMAP with winning annotations (atacumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'ATAC' + Caption: 'Cells UMAP with winning annotations' + + umap_tril_rd_wnnumap_plot_png: + type: File? + outputSource: triangulate/umap_tril_rd_wnnumap_plot_png + label: "Cells UMAP with integrated labels (wnnumap dim. reduction)" + doc: | + Cells UMAP with integrated labels (wnnumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'WNN' + Caption: 'Cells UMAP with integrated labels' + + umap_tric_rd_wnnumap_plot_png: + type: File? 
+ outputSource: triangulate/umap_tric_rd_wnnumap_plot_png + label: "Cells UMAP with integration confidence scores (wnnumap dim. reduction)" + doc: | + Cells UMAP with integration confidence scores (wnnumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'WNN' + Caption: 'Cells UMAP with integration confidence scores' + + umap_tria_rd_wnnumap_plot_png: + type: File? + outputSource: triangulate/umap_tria_rd_wnnumap_plot_png + label: "Cells UMAP with winning annotations (wnnumap dim. reduction)" + doc: | + Cells UMAP with winning annotations (wnnumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'WNN' + Caption: 'Cells UMAP with winning annotations' + + ucsc_cb_config_data: + type: File + outputSource: compress_cellbrowser_config_data/compressed_folder + label: "Compressed directory with UCSC Cellbrowser configuration data" + doc: | + Compressed directory with UCSC Cellbrowser configuration data. + + ucsc_cb_html_data: + type: Directory + outputSource: triangulate/ucsc_cb_html_data + label: "Directory with UCSC Cellbrowser html data" + doc: | + Directory with UCSC Cellbrowser html data. + + ucsc_cb_html_file: + type: File + outputSource: triangulate/ucsc_cb_html_file + label: "Open in UCSC Cell Browser" + doc: | + HTML index file from the directory with UCSC Cellbrowser html data. 
+ 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + seurat_data_rds: + type: File + outputSource: triangulate/seurat_data_rds + label: "Processed Seurat data in RDS format" + doc: | + Processed Seurat data in RDS format + + triangulate_stdout_log: + type: File + outputSource: triangulate/stdout_log + label: "stdout log generated by triangulate step" + doc: | + stdout log generated by triangulate step + + triangulate_stderr_log: + type: File + outputSource: triangulate/stderr_log + label: "stderr log generated by triangulate step" + doc: | + stderr log generated by triangulate step + + +steps: + + triangulate: + run: ../tools/sc-triangulate.cwl + in: + query_data_rds: query_data_rds + barcodes_data: barcodes_data + query_source_column: + source: [query_reduction, query_resolution] + valueFrom: $(get_query_column(self[0], self[1])) + verbose: + default: true + export_ucsc_cb: + default: true + color_theme: color_theme + parallel_memory_limit: + source: parallel_memory_limit + valueFrom: $(parseInt(self)) + vector_memory_limit: + source: vector_memory_limit + valueFrom: $(parseInt(self)) + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - umap_tril_rd_rnaumap_plot_png + - umap_tril_rd_atacumap_plot_png + - umap_tril_rd_wnnumap_plot_png + - umap_tria_rd_rnaumap_plot_png + - umap_tria_rd_atacumap_plot_png + - umap_tria_rd_wnnumap_plot_png + - umap_tric_rd_rnaumap_plot_png + - umap_tric_rd_atacumap_plot_png + - umap_tric_rd_wnnumap_plot_png + - ucsc_cb_config_data + - ucsc_cb_html_data + - ucsc_cb_html_file + - seurat_data_rds + - stdout_log + - stderr_log + + compress_cellbrowser_config_data: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: triangulate/ucsc_cb_config_data + out: + - compressed_folder + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-cell Label Integration Analysis" +s:name: 
"Single-cell Label Integration Analysis" +s:alternateName: "Harmonizes conflicting annotations in single-cell genomics studies" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-triangulate.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell Label Integration Analysis + + Harmonizes conflicting annotations in single-cell genomics studies. 
\ No newline at end of file diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl new file mode 100644 index 00000000..76f2c2fb --- /dev/null +++ b/workflows/sc-wnn-cluster.cwl @@ -0,0 +1,770 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var split_features = function(line) { + function get_unique(value, index, self) { + return self.indexOf(value) === index && value != ""; + } + let splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + - var split_numbers = function(line) { + let splitted_line = line?line.split(/[\s,]+/).map(parseFloat):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + + +'sd:upstream': + sc_tools_sample: + - "sc-wnn-cluster.cwl" + - "sc-rna-cluster.cwl" + - "sc-atac-cluster.cwl" + - "sc-rna-reduce.cwl" + - "sc-atac-reduce.cwl" + sc_arc_sample: + - "cellranger-arc-count.cwl" + - "cellranger-arc-aggr.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name/alias" + sd:preview: + position: 1 + + query_data_rds: + type: File + label: "Experiment run through both Single-cell RNA-Seq and ATAC-Seq Dimensionality Reduction Analyses" + doc: | + Path to the RDS file to load Seurat object from. This file should include + genes expression and chromatin accessibility information stored in the RNA + and ATAC assays correspondingly. Additionally, 'pca', 'rnaumap', 'atac_lsi' + and 'atacumap' dimensionality reductions should be present. + 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" + 'sd:localLabel': true + + rna_dimensions: + type: int? 
+ default: 40 + label: "Dimensionality from the 'pca' reduction to use when constructing weighted nearest-neighbor graph before clustering (from 1 to 50)" + doc: | + Dimensionality from the 'pca' reduction to use when constructing weighted + nearest-neighbor graph before clustering (from 1 to 50). If single value N + is provided, use from 1 to N dimensions. If multiple values are provided, + subset to only selected dimensions. + Default: from 1 to 40 + + atac_dimensions: + type: int? + default: 40 + label: "Dimensionality from the 'atac_lsi' reduction to use when constructing weighted nearest-neighbor graph before clustering (from 1 to 50)" + doc: | + Dimensionality from the 'atac_lsi' reduction to use when constructing weighted + nearest-neighbor graph before clustering (from 1 to 50). If single value N + is provided, use from 2 to N dimensions. If multiple values are provided, + subset to only selected dimensions. + Default: from 2 to 40 + + cluster_algorithm: + type: + - "null" + - type: enum + symbols: + - "louvain" + - "mult-louvain" + - "slm" + - "leiden" + default: "slm" + label: "Algorithm for modularity optimization when running clustering" + doc: | + Algorithm for modularity optimization when running clustering. + Default: slm + + resolution: + type: float? + default: 0.3 + label: "Clustering resolution" + doc: | + Clustering resolution applied to the constructed weighted nearest-neighbor + graph. Can be set as an array but only the first item from the list will + be used for cluster labels and gene/peak markers in the UCSC Cell Browser + when running with --cbbuild and --diffgenes/--diffpeaks parameters. + Default: 0.3 + + atac_fragments_file: + type: File? + secondaryFiles: + - .tbi + label: "Cell Ranger ARC Count/Aggregate Experiment" + doc: | + Count and barcode information for every ATAC fragment used in the loaded Seurat + object. File should be saved in TSV format with tbi-index file. 
+ 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" + 'sd:localLabel': true + + genes_of_interest: + type: string? + default: null + label: "Genes of interest to build gene expression and Tn5 insertion frequency plots" + doc: | + Genes of interest to build gene expression and Tn5 insertion frequency plots + for the nearest peaks. If '--fragments' is not provided only gene expression + plots will be built. + Default: None + + identify_diff_genes: + type: boolean? + default: false + label: "Identify differentially expressed genes (putative gene markers) between each pair of clusters" + doc: | + Identify differentially expressed genes (putative gene markers) between each + pair of clusters for all resolutions. + Default: false + 'sd:layout': + advanced: true + + identify_diff_peaks: + type: boolean? + default: false + label: "Identify differentially accessible peaks between each pair of clusters" + doc: | + Identify differentially accessible peaks between each pair of clusters for all resolutions. + Default: false + 'sd:layout': + advanced: true + + rna_minimum_logfc: + type: float? + default: 0.25 + label: "Include only those genes that on average have log fold change difference in expression between every tested pair of clusters not lower than this value" + doc: | + For putative gene markers identification include only those genes that + on average have log fold change difference in expression between every + tested pair of clusters not lower than this value. Ignored if '--diffgenes' + is not set. + Default: 0.25 + 'sd:layout': + advanced: true + + rna_minimum_pct: + type: float? + default: 0.1 + label: "Include only those genes that are detected in not lower than this fraction of cells in either of the two tested clusters" + doc: | + For putative gene markers identification include only those genes that + are detected in not lower than this fraction of cells in either of the + two tested clusters. Ignored if '--diffgenes' is not set. 
+ Default: 0.1 + 'sd:layout': + advanced: true + + atac_minimum_logfc: + type: float? + default: 0.25 + label: "Include only those peaks that on average have log fold change difference in the chromatin accessibility between every tested pair of clusters not lower than this value" + doc: | + For differentially accessible peaks identification include only those peaks that + on average have log fold change difference in the chromatin accessibility between + every tested pair of clusters not lower than this value. Ignored if '--diffpeaks' + is not set. + Default: 0.25 + 'sd:layout': + advanced: true + + atac_minimum_pct: + type: float? + default: 0.05 + label: "Include only those peaks that are detected in not lower than this fraction of cells in either of the two tested clusters" + doc: | + For differentially accessible peaks identification include only those peaks that + are detected in not lower than this fraction of cells in either of the two tested + clusters. Ignored if '--diffpeaks' is not set. + Default: 0.05 + 'sd:layout': + advanced: true + + umap_spread: + type: float? + label: "UMAP Spread - the effective scale of embedded points (determines how clustered/clumped the embedded points are)" + default: 1 + doc: | + The effective scale of embedded points on UMAP. In combination with '--mindist' + it determines how clustered/clumped the embedded points are. + Default: 1 + 'sd:layout': + advanced: true + + umap_mindist: + type: float? + label: "UMAP Min. Dist. - controls how tightly the embedding is allowed to compress points together" + default: 0.3 + doc: | + Controls how tightly the embedding is allowed to compress points together on UMAP. + Larger values ensure embedded points are more evenly distributed, while smaller + values allow the algorithm to optimise more accurately with regard to local structure. + Sensible values are in the range 0.001 to 0.5. + Default: 0.3 + 'sd:layout': + advanced: true + + umap_neighbors: + type: int? 
+ label: "UMAP Neighbors Number - determines the number of neighboring points used" + default: 30 + doc: | + Determines the number of neighboring points used in UMAP. Larger values will result + in more global structure being preserved at the loss of detailed local structure. + In general this parameter should often be in the range 5 to 50. + Default: 30 + 'sd:layout': + advanced: true + + umap_metric: + type: + - "null" + - type: enum + symbols: + - "euclidean" + - "cosine" + - "correlation" + label: "UMAP Dist. Metric - the metric to use to compute distances in high dimensional space" + default: "cosine" + doc: | + The metric to use to compute distances in high dimensional space for UMAP. + Default: cosine + 'sd:layout': + advanced: true + + umap_method: + type: + - "null" + - type: enum + symbols: + - "uwot" + - "uwot-learn" + - "umap-learn" + label: "UMAP implementation to run (if set to 'umap-learn' use 'correlation' distance metric)" + default: "uwot" + doc: | + UMAP implementation to run. If set to 'umap-learn' use --umetric 'correlation' + Default: uwot + 'sd:layout': + advanced: true + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + default: "classic" + label: "Color theme for all generated plots" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + 'sd:layout': + advanced: true + + parallel_memory_limit: + type: + - "null" + - type: enum + symbols: + - "32" + default: "32" + label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. 
+ Forced to 32 GB + 'sd:layout': + advanced: true + + vector_memory_limit: + type: + - "null" + - type: enum + symbols: + - "64" + default: "64" + label: "Maximum vector memory in GB allowed to be used by R" + doc: | + Maximum vector memory in GB allowed to be used by R. + Forced to 64 GB + 'sd:layout': + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + default: "1" + label: "Number of cores/cpus to use" + doc: | + Number of cores/cpus to use + Forced to 1 + 'sd:layout': + advanced: true + + +outputs: + + umap_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/umap_res_plot_png + label: "Clustered cells UMAP" + doc: | + Clustered cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Clustered cells UMAP' + + umap_spl_idnt_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/umap_spl_idnt_res_plot_png + label: "Split by dataset clustered cells UMAP" + doc: | + Split by dataset clustered cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by dataset clustered cells UMAP' + + cmp_gr_clst_spl_idnt_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/cmp_gr_clst_spl_idnt_res_plot_png + label: "Grouped by cluster split by dataset cells composition plot. Downsampled." + doc: | + Grouped by cluster split by dataset cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Grouped by cluster split by dataset cells composition plot. Downsampled.' + + cmp_gr_idnt_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/cmp_gr_idnt_spl_clst_res_plot_png + label: "Grouped by dataset split by cluster cells composition plot. Downsampled." + doc: | + Grouped by dataset split by cluster cells composition plot. Downsampled. 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Grouped by dataset split by cluster cells composition plot. Downsampled.' + + umap_spl_cnd_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/umap_spl_cnd_res_plot_png + label: "Split by grouping condition clustered cells UMAP" + doc: | + Split by grouping condition clustered cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Split by grouping condition clustered cells UMAP' + + cmp_gr_clst_spl_cnd_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/cmp_gr_clst_spl_cnd_res_plot_png + label: "Grouped by cluster split by condition cells composition plot. Downsampled." + doc: | + Grouped by cluster split by condition cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Grouped by cluster split by condition cells composition plot. Downsampled.' + + cmp_gr_cnd_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/cmp_gr_cnd_spl_clst_res_plot_png + label: "Grouped by condition split by cluster cells composition plot. Downsampled." + doc: | + Grouped by condition split by cluster cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Grouped by condition split by cluster cells composition plot. Downsampled.' + + umap_spl_ph_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/umap_spl_ph_res_plot_png + label: "Split by cell cycle phase clustered cells UMAP" + doc: | + Split by cell cycle phase clustered cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by cell cycle phase clustered cells UMAP' + + cmp_gr_ph_spl_idnt_plot_png: + type: File? 
+ outputSource: sc_wnn_cluster/cmp_gr_ph_spl_idnt_plot_png + label: "Grouped by cell cycle phase split by dataset cells composition plot. Downsampled." + doc: | + Grouped by cell cycle phase split by dataset cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Grouped by cell cycle phase split by dataset cells composition plot. Downsampled.' + + cmp_gr_ph_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/cmp_gr_ph_spl_clst_res_plot_png + label: "Grouped by cell cycle phase split by cluster cells composition plot. Downsampled." + doc: | + Grouped by cell cycle phase split by cluster cells composition plot. Downsampled. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Grouped by cell cycle phase split by cluster cells composition plot. Downsampled.' + + xpr_avg_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/xpr_avg_res_plot_png + label: "Log normalized scaled average gene expression per cluster" + doc: | + Log normalized scaled average gene expression per cluster. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized scaled average gene expression per cluster' + + xpr_per_cell_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/xpr_per_cell_plot_png + label: "Log normalized gene expression on cells UMAP" + doc: | + Log normalized gene expression on cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized gene expression on cells UMAP' + + xpr_per_cell_sgnl_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/xpr_per_cell_sgnl_plot_png + label: "Log normalized gene expression density on cells UMAP" + doc: | + Log normalized gene expression density on cells UMAP. 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized gene expression density on cells UMAP' + + xpr_dnst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/xpr_dnst_res_plot_png + label: "Log normalized gene expression density per cluster" + doc: | + Log normalized gene expression density per cluster. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Log normalized gene expression density per cluster' + + cvrg_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/cvrg_res_plot_png + label: "Tn5 insertion frequency plot around gene" + doc: | + Tn5 insertion frequency plot around gene. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Genome coverage' + Caption: 'Tn5 insertion frequency plot around gene' + + xpr_htmp_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/xpr_htmp_res_plot_png + label: "Normalized gene expression heatmap grouped by cluster" + doc: | + Normalized gene expression heatmap grouped by cluster. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Normalized gene expression heatmap grouped by cluster' + + gene_markers_tsv: + type: File? + outputSource: sc_wnn_cluster/gene_markers_tsv + label: "Differentially expressed genes between each pair of clusters" + doc: | + Differentially expressed genes between each pair of clusters for all resolutions. + TSV format + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'Gene markers' + Title: 'Differentially expressed genes between each pair of clusters' + + peak_markers_tsv: + type: File? + outputSource: sc_wnn_cluster/peak_markers_tsv + label: "Differentially accessible peaks between each pair of clusters" + doc: | + Differentially accessible peaks between each pair of clusters for all resolutions. + TSV format + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'Diff. 
peaks' + Title: 'Differentially accessible peaks between each pair of clusters' + + ucsc_cb_config_data: + type: File + outputSource: compress_cellbrowser_config_data/compressed_folder + label: "Compressed directory with UCSC Cellbrowser configuration data" + doc: | + Compressed directory with UCSC Cellbrowser configuration data. + + ucsc_cb_html_data: + type: Directory + outputSource: sc_wnn_cluster/ucsc_cb_html_data + label: "Directory with UCSC Cellbrowser html data" + doc: | + Directory with UCSC Cellbrowser html data. + + ucsc_cb_html_file: + type: File + outputSource: sc_wnn_cluster/ucsc_cb_html_file + label: "Open in UCSC Cell Browser" + doc: | + HTML index file from the directory with UCSC Cellbrowser html data. + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + seurat_data_rds: + type: File + outputSource: sc_wnn_cluster/seurat_data_rds + label: "Processed Seurat data in RDS format" + doc: | + Processed Seurat data in RDS format + + sc_wnn_cluster_stdout_log: + type: File + outputSource: sc_wnn_cluster/stdout_log + label: "stdout log generated by sc_wnn_cluster step" + doc: | + stdout log generated by sc_wnn_cluster step + + sc_wnn_cluster_stderr_log: + type: File + outputSource: sc_wnn_cluster/stderr_log + label: "stderr log generated by sc_wnn_cluster step" + doc: | + stderr log generated by sc_wnn_cluster step + + +steps: + + sc_wnn_cluster: + doc: | + Clusters multiome ATAC and RNA-Seq datasets, identifies + gene markers and differentially accessible peaks + run: ../tools/sc-wnn-cluster.cwl + in: + query_data_rds: query_data_rds + rna_dimensions: rna_dimensions + atac_dimensions: atac_dimensions + cluster_algorithm: cluster_algorithm + resolution: resolution + atac_fragments_file: atac_fragments_file + genes_of_interest: + source: genes_of_interest + valueFrom: $(split_features(self)) + identify_diff_genes: identify_diff_genes + identify_diff_peaks: identify_diff_peaks + rna_minimum_logfc: rna_minimum_logfc + rna_minimum_pct: 
rna_minimum_pct + atac_minimum_logfc: atac_minimum_logfc + atac_minimum_pct: atac_minimum_pct + only_positive_diff_genes: + default: true + rna_test_to_use: + default: wilcox + atac_test_to_use: + default: LR + umap_spread: umap_spread + umap_mindist: umap_mindist + umap_neighbors: umap_neighbors + umap_metric: umap_metric + umap_method: umap_method + verbose: + default: true + export_ucsc_cb: + default: true + color_theme: color_theme + parallel_memory_limit: + source: parallel_memory_limit + valueFrom: $(parseInt(self)) + vector_memory_limit: + source: vector_memory_limit + valueFrom: $(parseInt(self)) + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - umap_res_plot_png + - umap_spl_idnt_res_plot_png + - cmp_gr_clst_spl_idnt_res_plot_png + - cmp_gr_idnt_spl_clst_res_plot_png + - umap_spl_cnd_res_plot_png + - cmp_gr_clst_spl_cnd_res_plot_png + - cmp_gr_cnd_spl_clst_res_plot_png + - umap_spl_ph_res_plot_png + - cmp_gr_ph_spl_idnt_plot_png + - cmp_gr_ph_spl_clst_res_plot_png + - xpr_avg_res_plot_png + - xpr_per_cell_plot_png + - xpr_per_cell_sgnl_plot_png + - xpr_dnst_res_plot_png + - cvrg_res_plot_png + - xpr_htmp_res_plot_png + - gene_markers_tsv + - peak_markers_tsv + - ucsc_cb_config_data + - ucsc_cb_html_data + - ucsc_cb_html_file + - seurat_data_rds + - stdout_log + - stderr_log + + compress_cellbrowser_config_data: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: sc_wnn_cluster/ucsc_cb_config_data + out: + - compressed_folder + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-cell WNN Cluster Analysis" +s:name: "Single-cell WNN Cluster Analysis" +s:alternateName: "Clusters multiome ATAC and RNA-Seq datasets, identifies gene markers and differentially accessible peaks" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-wnn-cluster.cwl +s:codeRepository: 
https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell WNN Cluster Analysis + + Clusters multiome ATAC and RNA-Seq datasets, identifies gene markers + and differentially accessible peaks. 
\ No newline at end of file diff --git a/workflows/single-cell-preprocess-cellranger.cwl b/workflows/single-cell-preprocess-cellranger.cwl index fab75af9..990103d1 100644 --- a/workflows/single-cell-preprocess-cellranger.cwl +++ b/workflows/single-cell-preprocess-cellranger.cwl @@ -34,47 +34,62 @@ inputs: - File - type: array items: File - format: "http://edamontology.org/format_1930" - label: "FASTQ file R1 (optionally compressed)" - doc: "FASTQ file R1 (optionally compressed)" + label: "FASTQ file(s) R1 (optionally compressed)" + doc: "FASTQ file(s) R1 (optionally compressed)" fastq_file_r2: type: - File - type: array items: File - format: "http://edamontology.org/format_1930" - label: "FASTQ file R2 (optionally compressed)" - doc: "FASTQ file R2 (optionally compressed)" + label: "FASTQ file(s) R2 (optionally compressed)" + doc: "FASTQ file(s) R2 (optionally compressed)" + + r1_length: + type: int? + default: null + label: "Limit the length of the input R1 sequence" + doc: "Limit the length of the input R1 sequence" + 'sd:layout': + advanced: true + + r2_length: + type: int? + default: null + label: "Limit the length of the input R2 sequence" + doc: "Limit the length of the input R2 sequence" + 'sd:layout': + advanced: true expect_cells: type: int? - default: 3000 - label: "Expected number of recovered cells" - doc: "Expected number of recovered cells" + default: null + label: "Expected number of recovered cells. If not provided - use auto-estimated" + doc: "Expected number of recovered cells. If not provided - use auto-estimated" 'sd:layout': advanced: true - force_expect_cells: + force_cells: + type: int? + default: null + label: "Force pipeline to use this number of cells, bypassing the cell detection algorithm" + doc: "Force pipeline to use this number of cells, bypassing the cell detection algorithm" + 'sd:layout': + advanced: true + + exclude_introns: type: boolean? 
default: false - label: "Force pipeline to use the expected number of recovered cells" - doc: | - Force pipeline to use the expected number of recovered cell. - The value provided in expect_cells will be sent to Cell Ranger Count as --force-cells. - The latter will bypass the cell detection algorithm. Use this if the number of cells - estimated by Cell Ranger is not consistent with the barcode rank plot. + label: "Do not count intronic reads for whole transcriptome gene expression data" + doc: "Do not count intronic reads for whole transcriptome gene expression data" 'sd:layout': advanced: true - include_introns: + no_bam: type: boolean? - default: false - label: "Count reads mapping to intronic regions. For samples with a significant amount of pre-mRNA molecules, such as nuclei" - doc: | - Add this flag to count reads mapping to intronic regions. - This may improve sensitivity for samples with a significant - amount of pre-mRNA molecules, such as nuclei. + default: true + label: "Do not generate the BAM file" + doc: "Do not generate the BAM file" 'sd:layout': advanced: true @@ -141,7 +156,7 @@ outputs: Run summary metrics in CSV format possorted_genome_bam_bai: - type: File + type: File? 
outputSource: generate_counts_matrix/possorted_genome_bam_bai label: "Aligned to the genome indexed reads BAM+BAI files" doc: | @@ -202,15 +217,31 @@ outputs: doc: | Loupe Browser visualization and analysis file - collected_statistics: + collected_statistics_yaml: + type: File + outputSource: collect_statistics/collected_statistics_yaml + label: "Collected statistics in YAML format" + doc: "Collected statistics in YAML format" + + collected_statistics_md: type: File - outputSource: collect_statistics/collected_statistics + outputSource: collect_statistics/collected_statistics_md label: "Collected statistics in Markdown format" doc: "Collected statistics in Markdown format" 'sd:visualPlugins': - markdownView: tab: 'Overview' + collected_statistics_tsv: + type: File + outputSource: collect_statistics/collected_statistics_tsv + label: "Collected statistics in TSV format" + doc: "Collected statistics in TSV format" + 'sd:visualPlugins': + - tableView: + vertical: true + tab: 'Overview' + generate_counts_matrix_stdout_log: type: File outputSource: generate_counts_matrix/stdout_log @@ -256,18 +287,18 @@ steps: extract_fastq_r1: run: ../tools/extract-fastq.cwl in: - output_prefix: - default: "read_1" compressed_file: fastq_file_r1 + output_prefix: + default: "read_1" out: - fastq_file extract_fastq_r2: run: ../tools/extract-fastq.cwl in: - output_prefix: - default: "read_2" compressed_file: fastq_file_r2 + output_prefix: + default: "read_2" out: - fastq_file @@ -293,13 +324,12 @@ steps: fastq_file_r1: extract_fastq_r1/fastq_file fastq_file_r2: extract_fastq_r2/fastq_file indices_folder: indices_folder - expect_cells: - source: [expect_cells, force_expect_cells] - valueFrom: $(self[1]?null:self[0]) - force_cells: - source: [expect_cells, force_expect_cells] - valueFrom: $(self[1]?self[0]:null) - include_introns: include_introns + r1_length: r1_length + r2_length: r2_length + expect_cells: expect_cells + force_cells: force_cells + no_bam: no_bam + exclude_introns: 
exclude_introns threads: threads memory_limit: memory_limit virt_memory_limit: memory_limit @@ -339,45 +369,13 @@ steps: - compressed_folder collect_statistics: - run: - cwlVersion: v1.0 - class: CommandLineTool - hints: - - class: DockerRequirement - dockerPull: rackspacedot/python37 - inputs: - script: - type: string? - default: | - #!/usr/bin/env python3 - import sys, csv - with open(sys.argv[1], "r") as input_stream: - with open("collected_statistics.md", "w") as output_stream: - output_stream.write("### Cell Ranger Statistics\n") - keys, values = None, None - for i, row in enumerate(csv.reader(input_stream)): - if i==0: - keys = row - else: - values = row - for k,v in zip(keys, values): - output_stream.write("- "+k+": "+v+"\n") - inputBinding: - position: 5 - metrics_summary_report: - type: File - inputBinding: - position: 6 - outputs: - collected_statistics: - type: File - outputBinding: - glob: "*" - baseCommand: ["python3", "-c"] + run: ../tools/collect-stats-sc-count.cwl in: metrics_summary_report: generate_counts_matrix/metrics_summary_report out: - - collected_statistics + - collected_statistics_yaml + - collected_statistics_tsv + - collected_statistics_md cellbrowser_build: run: ../tools/cellbrowser-build-cellranger.cwl @@ -443,4 +441,5 @@ s:creator: doc: | Cell Ranger Count Gene Expression - ================================= \ No newline at end of file + + Quantifies gene expression from a single-cell RNA-Seq library. 
\ No newline at end of file From 1e47e0dbe04dc0fa58af15c1f8e81c742bfe87c6 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 5 Apr 2023 15:37:48 -0400 Subject: [PATCH 017/162] Meaningless changes --- workflows/cellranger-mkref.cwl | 3 +-- workflows/cellranger-mkvdjref.cwl | 2 +- workflows/cellranger-multi.cwl | 2 +- workflows/cellranger-reanalyze.cwl | 3 +-- workflows/diffbind-multi-factor.cwl | 2 +- workflows/sc-atac-cluster.cwl | 3 +-- workflows/sc-atac-reduce.cwl | 3 +-- workflows/sc-ctype-assign.cwl | 2 +- workflows/sc-multiome-filter.cwl | 2 +- workflows/sc-rna-cluster.cwl | 2 +- workflows/sc-rna-da-cells.cwl | 3 +-- workflows/sc-rna-de-pseudobulk.cwl | 3 +-- workflows/sc-rna-filter.cwl | 2 +- workflows/sc-rna-reduce.cwl | 3 ++- workflows/sc-triangulate.cwl | 3 ++- workflows/sc-wnn-cluster.cwl | 4 ++-- workflows/single-cell-preprocess-cellranger.cwl | 2 +- 17 files changed, 20 insertions(+), 24 deletions(-) diff --git a/workflows/cellranger-mkref.cwl b/workflows/cellranger-mkref.cwl index 18489713..ac2587e1 100644 --- a/workflows/cellranger-mkref.cwl +++ b/workflows/cellranger-mkref.cwl @@ -226,5 +226,4 @@ s:creator: doc: | - Cell Ranger Build Reference Indices - =================================== \ No newline at end of file + Cell Ranger Build Reference Indices \ No newline at end of file diff --git a/workflows/cellranger-mkvdjref.cwl b/workflows/cellranger-mkvdjref.cwl index 159fa3ee..3513635d 100644 --- a/workflows/cellranger-mkvdjref.cwl +++ b/workflows/cellranger-mkvdjref.cwl @@ -139,6 +139,6 @@ s:creator: doc: | Cell Ranger Build V(D)J Reference Indices - + Build a Cell Ranger V(D)J-compatible reference folder from a user-supplied genome FASTA and gene GTF files. 
\ No newline at end of file diff --git a/workflows/cellranger-multi.cwl b/workflows/cellranger-multi.cwl index cc0d1790..dd7c2142 100644 --- a/workflows/cellranger-multi.cwl +++ b/workflows/cellranger-multi.cwl @@ -671,6 +671,6 @@ s:creator: doc: | Cell Ranger Multi Gene Expression and V(D)J Repertoire Profiling - + Quantifies gene expression and performs profiling of V(D)J repertoire from a single GEM well \ No newline at end of file diff --git a/workflows/cellranger-reanalyze.cwl b/workflows/cellranger-reanalyze.cwl index cb34623b..1d815d3e 100644 --- a/workflows/cellranger-reanalyze.cwl +++ b/workflows/cellranger-reanalyze.cwl @@ -613,5 +613,4 @@ s:creator: doc: | - Cellranger Reanalyze - ==================== \ No newline at end of file + Cellranger Reanalyze \ No newline at end of file diff --git a/workflows/diffbind-multi-factor.cwl b/workflows/diffbind-multi-factor.cwl index 407dd07d..166661c2 100644 --- a/workflows/diffbind-multi-factor.cwl +++ b/workflows/diffbind-multi-factor.cwl @@ -969,7 +969,7 @@ s:creator: doc: | DiffBind Multi-factor Analysis - ------------------------------ + DiffBind processes ChIP-Seq data enriched for genomic loci where specific protein/DNA binding occurs, including peak sets identified by ChIP-Seq peak callers and aligned sequence read datasets. It is designed to work with multiple peak sets simultaneously, representing different ChIP experiments (antibodies, transcription diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index cfa6ff67..b113cbdd 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -515,5 +515,4 @@ s:creator: doc: | Single-cell ATAC-Seq Cluster Analysis - Clusters single-cell ATAC-Seq datasets, identifies differentially - accessible peaks. \ No newline at end of file + Clusters single-cell ATAC-Seq datasets, identifies differentially accessible peaks. 
\ No newline at end of file diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index eec05b14..332f84a5 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -462,5 +462,4 @@ s:creator: doc: | Single-cell ATAC-Seq Dimensionality Reduction Analysis - Integrates multiple single-cell ATAC-Seq datasets, - reduces dimensionality using LSI. \ No newline at end of file + Integrates multiple single-cell ATAC-Seq datasets, reduces dimensionality using LSI. \ No newline at end of file diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index cc45dee6..db9a6ce7 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -819,5 +819,5 @@ s:creator: doc: | Single-cell Manual Cell Type Assignment - + Assigns cell types for clusters based on the provided metadata file. \ No newline at end of file diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 8f1e5964..2914f9d2 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -1400,6 +1400,6 @@ s:creator: doc: | Single-cell Multiome ATAC and RNA-Seq Filtering Analysis - + Filters single-cell multiome ATAC and RNA-Seq datasets based on the common QC metrics. \ No newline at end of file diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index e57c0945..f8f9a10a 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -610,5 +610,5 @@ s:creator: doc: | Single-cell RNA-Seq Cluster Analysis - =============================================================== + Clusters single-cell RNA-Seq datasets, identifies gene markers. 
\ No newline at end of file diff --git a/workflows/sc-rna-da-cells.cwl b/workflows/sc-rna-da-cells.cwl index 753bef07..a4a8c20a 100644 --- a/workflows/sc-rna-da-cells.cwl +++ b/workflows/sc-rna-da-cells.cwl @@ -469,5 +469,4 @@ s:creator: doc: | Single-cell Differential Abundance Analysis - Detects cell subpopulations with differential abundance - between datasets split by biological condition. \ No newline at end of file + Detects cell subpopulations with differential abundance between datasets split by biological condition. \ No newline at end of file diff --git a/workflows/sc-rna-de-pseudobulk.cwl b/workflows/sc-rna-de-pseudobulk.cwl index ad588b5c..83454378 100644 --- a/workflows/sc-rna-de-pseudobulk.cwl +++ b/workflows/sc-rna-de-pseudobulk.cwl @@ -741,5 +741,4 @@ s:creator: doc: | Single-cell Pseudobulk Differential Expression Analysis Between Datasets - Identifies differentially expressed genes between groups of cells - coerced to pseudobulk datasets. \ No newline at end of file + Identifies differentially expressed genes between groups of cells coerced to pseudobulk datasets. \ No newline at end of file diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index be2e14cc..86be6008 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -708,5 +708,5 @@ s:creator: doc: | Single-cell RNA-Seq Filtering Analysis - + Filters single-cell RNA-Seq datasets based on the common QC metrics. \ No newline at end of file diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 2b5ce3c1..33fc5adb 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -671,4 +671,5 @@ s:creator: doc: | Single-cell RNA-Seq Dimensionality Reduction Analysis - Integrates multiple single-cell RNA-Seq datasets, reduces dimensionality using PCA. \ No newline at end of file + Integrates multiple single-cell RNA-Seq datasets, reduces + dimensionality using PCA. 
\ No newline at end of file diff --git a/workflows/sc-triangulate.cwl b/workflows/sc-triangulate.cwl index 21b93b73..f47c072a 100644 --- a/workflows/sc-triangulate.cwl +++ b/workflows/sc-triangulate.cwl @@ -400,4 +400,5 @@ s:creator: doc: | Single-cell Label Integration Analysis - Harmonizes conflicting annotations in single-cell genomics studies. \ No newline at end of file + Harmonizes conflicting annotations in single-cell + genomics studies. \ No newline at end of file diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index 76f2c2fb..50824cd1 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -766,5 +766,5 @@ s:creator: doc: | Single-cell WNN Cluster Analysis - Clusters multiome ATAC and RNA-Seq datasets, identifies gene markers - and differentially accessible peaks. \ No newline at end of file + Clusters multiome ATAC and RNA-Seq datasets, identifies gene + markers and differentially accessible peaks. \ No newline at end of file diff --git a/workflows/single-cell-preprocess-cellranger.cwl b/workflows/single-cell-preprocess-cellranger.cwl index 990103d1..a81326ee 100644 --- a/workflows/single-cell-preprocess-cellranger.cwl +++ b/workflows/single-cell-preprocess-cellranger.cwl @@ -441,5 +441,5 @@ s:creator: doc: | Cell Ranger Count Gene Expression - + Quantifies gene expression from a single-cell RNA-Seq library. 
\ No newline at end of file From c6497cee77a6b9ce588748966b5dd0011042e1ea Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 20 Apr 2023 16:29:54 -0400 Subject: [PATCH 018/162] Add samples order explanation to diffbind multi-factor analysis --- workflows/diffbind-multi-factor.cwl | 31 +++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/workflows/diffbind-multi-factor.cwl b/workflows/diffbind-multi-factor.cwl index d1b5dceb..397a2523 100644 --- a/workflows/diffbind-multi-factor.cwl +++ b/workflows/diffbind-multi-factor.cwl @@ -597,6 +597,16 @@ outputs: GCT file with normalized read counts per peak outputSource: extend_gct/extended_gct + experiment_info: + type: File + label: "Samples order for IGV" + doc: | + Markdown file to explain the sample order for IGV + outputSource: create_metadata/output_file + 'sd:visualPlugins': + - markdownView: + tab: 'Overview' + diffbind_stdout_log: type: File label: "DiffBind stdout log" @@ -919,6 +929,27 @@ steps: - stdout_log - stderr_log + create_metadata: + run: ../tools/custom-bash.cwl + in: + input_file: peak_files + param: dataset_names + script: + default: | + #!/bin/bash + set -- "$0" "$@" + COUNT=`expr $# / 2` + echo "| Sample | Index |" > experiment_info.md + echo "| -- | -- |" >> experiment_info.md + j=1 + for i in "${@:$COUNT+1:$#}"; do + echo "Add $i as $j" + echo "| $i | $j |" >> experiment_info.md + (( j++ )) + done; + out: + - output_file + $namespaces: s: http://schema.org/ From 99deb2fd57e771d5b53ab210d532414b155d40d1 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 20 Apr 2023 17:01:15 -0400 Subject: [PATCH 019/162] Change columns alignment --- workflows/diffbind-multi-factor.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/diffbind-multi-factor.cwl b/workflows/diffbind-multi-factor.cwl index 397a2523..a9c32c7a 100644 --- a/workflows/diffbind-multi-factor.cwl +++ b/workflows/diffbind-multi-factor.cwl @@ -940,7 +940,7 @@ steps: set -- "$0" 
"$@" COUNT=`expr $# / 2` echo "| Sample | Index |" > experiment_info.md - echo "| -- | -- |" >> experiment_info.md + echo "| :-- | --: |" >> experiment_info.md j=1 for i in "${@:$COUNT+1:$#}"; do echo "Add $i as $j" From d67db912cbb74adb55f599e19c56a571f1c44279 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 27 Apr 2023 15:20:20 -0400 Subject: [PATCH 020/162] Copied deeptools and diffbind filtering workflow from the custom the main --- workflows/filter-diffbind-for-heatmap.cwl | 171 +++++++++++ workflows/heatmap-deeptools.cwl | 350 ++++++++++++++++++++++ 2 files changed, 521 insertions(+) create mode 100644 workflows/filter-diffbind-for-heatmap.cwl create mode 100644 workflows/heatmap-deeptools.cwl diff --git a/workflows/filter-diffbind-for-heatmap.cwl b/workflows/filter-diffbind-for-heatmap.cwl new file mode 100644 index 00000000..ab611e45 --- /dev/null +++ b/workflows/filter-diffbind-for-heatmap.cwl @@ -0,0 +1,171 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: InlineJavascriptRequirement + - class: MultipleInputFeatureRequirement + + +'sd:upstream': + sample_to_filter: + - "diffbind.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name/alias" + sd:preview: + position: 1 + + feature_file: + type: File + format: "http://edamontology.org/format_3475" + label: "Differential Binding Analysis experiment" + doc: "Differential binding analysis results exported as TSV" + 'sd:upstreamSource': "sample_to_filter/diffbind_report_file" + 'sd:localLabel': true + + sql_query: + type: string + label: "Filtering parameters" + doc: "Filtering parameters (WHERE parameters for SQL query)" + 'sd:filtering': + params: + columns: ["Refseq_id", "Gene_id", "txStart", "txEnd", "Strand", "Region", "Chr", "Start", "End", "Conc", "Conc1", "Conc2", "Fold", "p-value", "FDR", "Called1", "Called2"] + types: ["string", "string", "number", "number", "string", 
"string", "string", "number", "number", "number", "number", "number", "number", "number", "number","number", "number"] + + header: + type: boolean? + default: false + label: "Include header line" + doc: "Print header line in the output file" + 'sd:layout': + advanced: true + + columns: + type: + - "null" + - string[] + default: ["Chr", "Start", "End"] + label: "Columns to print" + doc: | + List of columns to print (SELECT parameters for SQL query). + 'sd:layout': + advanced: true + + +outputs: + + filtered_file: + type: File + format: "http://edamontology.org/format_3003" + label: "Filtered regions" + doc: "Filtered regions of interest by default formatted as headerless BED file with [Chr Start End]" + outputSource: feature_select/filtered_file + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'Filtering results' + Title: 'Filtered table' + + filtering_stdout_log: + type: File + format: "http://edamontology.org/format_2330" + label: "Filtering stdout log" + doc: "Filtering stdout log" + outputSource: feature_select/stdout_log + + filtering_stderr_log: + type: File + format: "http://edamontology.org/format_2330" + label: "Filtering stderr log" + doc: "Filtering stderr log" + outputSource: feature_select/stderr_log + + +steps: + + rename_header: + run: ../tools/custom-bash.cwl + in: + input_file: feature_file + script: + default: | + echo "Replacing header to include Conc1 and Conc2 instead of Conc_[group1] and Conc_[group2]" + cat "$0" | grep -v "Refseq_id" | cut -f 1-17 > headerless_report.tsv + echo -e "Refseq_id\tGene_id\ttxStart\ttxEnd\tStrand\tRegion\tChr\tStart\tEnd\tConc\tConc1\tConc2\tFold\tp-value\tFDR\tCalled1\tCalled2" > `basename $0` + cat headerless_report.tsv >> `basename $0` + rm -f headerless_report.tsv + head `basename $0` + out: + - output_file + + feature_select: + run: ../tools/feature-select-sql.cwl + in: + feature_file: rename_header/output_file + sql_query: sql_query + columns: + source: columns + valueFrom: $("DISTINCT " + self.join(", ")) # 
multiple peaks can have the same coordinates but different abssummit, so we need to use DISTINCT + header: header + out: + - filtered_file + - stdout_log + - stderr_log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +s:name: "Filter DiffBind results for deepTools heatmap analysis" +label: "Filter DiffBind results for deepTools heatmap analysis" +s:alternateName: "Filter differentially bound sites from DiffBind analysis to be used with deepTools heatmap analysis" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/filter-diffbind-for-heatmap.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Filter DiffBind results for deepTools heatmap analysis + ====================================================== + + Filter differentially bound sites from DiffBind analysis to be used with deepTools heatmap analysis diff --git a/workflows/heatmap-deeptools.cwl b/workflows/heatmap-deeptools.cwl 
new file mode 100644 index 00000000..9fcedfb5 --- /dev/null +++ b/workflows/heatmap-deeptools.cwl @@ -0,0 +1,350 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: StepInputExpressionRequirement + - class: InlineJavascriptRequirement + - class: MultipleInputFeatureRequirement + - class: SubworkflowFeatureRequirement + + +'sd:upstream': + filtered_regions_sample: + - "filter-diffbind-for-heatmap.cwl" + chipseq_sample: + - "trim-chipseq-se.cwl" + - "trim-chipseq-pe.cwl" + - "trim-atacseq-se.cwl" + - "trim-atacseq-pe.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name/alias" + sd:preview: + position: 1 + + scores_files: + type: File[] + format: "http://edamontology.org/format_3006" + label: "ChIP/ATAC-Seq sample(s)" + doc: "bigWig file from ChIP/ATAC-Seq sample(s)" + 'sd:upstreamSource': "chipseq_sample/bigwig" + 'sd:localLabel': true + + scores_labels: + type: string[] + label: "ChIP/ATAC-Seq sample(s)" + doc: "Aliases for ChIP/ATAC-Seq sample(s)" + 'sd:upstreamSource': "chipseq_sample/alias" + + regions_files: + type: File[] + format: "http://edamontology.org/format_3003" + label: "Filtered DiffBind results sample(s)" + doc: "File(s) generated by Filter DiffBind results for deepTools heatmap analysis pipeline" + 'sd:upstreamSource': "filtered_regions_sample/filtered_file" + 'sd:localLabel': true + + regions_labels: + type: string[] + label: "Filtered DiffBind results sample(s)" + doc: "Aliases for file(s) generated by Filter DiffBind results for deepTools heatmap analysis pipeline" + 'sd:upstreamSource': "filtered_regions_sample/alias" + 'sd:localLabel': true + + before_region_start_length: + type: int? + default: 5000 + label: "Distance upstream of the reference-point selected" + doc: | + Distance upstream of the reference-point selected. + 'sd:layout': + advanced: true + + after_region_start_length: + type: int? 
+ default: 5000 + label: "Distance downstream of the reference-point selected" + doc: | + Distance downstream of the reference-point selected. + 'sd:layout': + advanced: true + + bin_size: + type: int? + default: 10 + label: "Length, in bases, of the non-overlapping bins for averaging the score over the regions length" + doc: | + Length, in bases, of the non-overlapping bins for averaging the score over + the regions length. + 'sd:layout': + advanced: true + + plot_type: + type: + - "null" + - type: enum + symbols: + - "lines" + - "fill" + - "se" + - "std" + default: "lines" + label: "Plot type to display" + doc: | + “lines” will plot the profile line based on the average type selected. + “fill” fills the region between zero and the profile curve. The fill in + color is semi transparent to distinguish different profiles. “se” and + “std” color the region between the profile and the standard error or + standard deviation of the data. + 'sd:layout': + advanced: true + + sort_regions: + type: + - "null" + - type: enum + symbols: + - "descend" + - "ascend" + - "no" + - "keep" + default: "descend" + label: "Sorting order for regions" + doc: | + Whether the heatmap should present the regions sorted. The default is to sort in + descending order based on the mean value per region. Note that “keep” and “no” are + the same thing. + 'sd:layout': + advanced: true + + what_to_show: + type: + - "null" + - type: enum + symbols: + - plot, heatmap and colorbar + - plot and heatmap + - heatmap only + - heatmap and colorbar + default: "plot, heatmap and colorbar" + label: "What show on the plot" + doc: | + The default is to include a summary or profile plot on top of the heatmap and a heatmap colorbar. + Other options are: “plot and heatmap”, “heatmap only”, “heatmap and colorbar”, and the default “plot, + heatmap and colorbar”. + 'sd:layout': + advanced: true + + per_group: + type: boolean? 
+ default: false + label: "Plot all samples by group of regions instead of groups of regions by sample" + doc: | + The default is to plot all groups of regions by sample. Using this option instead plots all + samples by group of regions. Note that this is only useful if you have multiple groups of + regions by sample rather than group. + 'sd:layout': + advanced: true + + threads: + type: int? + default: 1 + label: "Number of threads" + doc: "Number of threads for those steps that support multithreading" + 'sd:layout': + advanced: true + + +outputs: + + scores_matrix: + type: File + outputSource: compute_score_matrix/scores_matrix + label: "Scores per genome regions matrix" + doc: | + Scores per genome regions matrix. This file that can be used + with plotHeatmap and plotProfiles. + + heatmap_png: + type: File + outputSource: make_heatmap/heatmap_file + label: "Heatmap for scores around centers of provided regions" + doc: | + Heatmap for scores around centers of provided regions. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Plots' + Caption: 'Heatmap for scores around centers of provided regions' + + compute_score_matrix_stdout_log: + type: File + outputSource: compute_score_matrix/stdout_log + label: "compute_score_matrix stdout log" + doc: | + compute_score_matrix stdout log + + compute_score_matrix_stderr_log: + type: File + outputSource: compute_score_matrix/stderr_log + label: "compute_score_matrix stderr log" + doc: | + compute_score_matrix stderr log + + make_heatmap_stdout_log: + type: File + outputSource: make_heatmap/stdout_log + label: "make_heatmap stdout log" + doc: | + make_heatmap stdout log + + make_heatmap_stderr_log: + type: File + outputSource: make_heatmap/stderr_log + label: "make_heatmap stderr log" + doc: | + make_heatmap stderr log + + +steps: + + recenter_regions: + run: + cwlVersion: v1.0 + class: Workflow + requirements: + - class: ScatterFeatureRequirement + inputs: + regions_files: + type: File[] + outputs: + 
recentered_regions_files: + type: File[] + outputSource: recenter/output_file + steps: + recenter: + run: ../tools/custom-bash.cwl + in: + input_file: regions_files + script: + default: | + # chrom start end + echo "Recenter by the peak center" + cat "$0" | tr -d "\r" | tr "," "\t" | awk NF | sort -u -k1,1 -k2,2n -k3,3n | awk '{center=$2+int(($3-$2)/2); print $1"\t"center"\t"center+1}' > "${RANDOM}_"`basename $0` + scatter: input_file + out: + - output_file + in: + regions_files: regions_files + out: + - recentered_regions_files + + compute_score_matrix: + run: ../tools/deeptools-computematrix-referencepoint.cwl + in: + score_files: scores_files + regions_files: recenter_regions/recentered_regions_files + reference_point: + default: "TSS" # doesn't matter what we set here because we centered regions ourlselves + before_region_start_length: before_region_start_length + after_region_start_length: after_region_start_length + bin_size: bin_size + sort_regions: sort_regions + samples_label: scores_labels + output_filename: + default: "score_matrix.gz" + missing_data_as_zero: + default: true + threads: threads + out: + - scores_matrix + - stdout_log + - stderr_log + + make_heatmap: + run: ../tools/deeptools-plotheatmap.cwl + in: + plot_title: + default: "Tag density around peak centers" + scores_matrix: compute_score_matrix/scores_matrix + output_filename: + default: "score_heatmap.png" + plot_type: plot_type + sort_regions: sort_regions + average_type_summary_plot: + default: "mean" + what_to_show: what_to_show + ref_point_label: + default: "Peak Center" + regions_label: regions_labels + samples_label: scores_labels + x_axis_label: + default: "distance (bp)" + y_axisLabel: + default: "Signal mean" + per_group: per_group + plot_file_format: + default: "png" + legend_location: + default: "upper-left" + out: + - heatmap_file + - stdout_log + - stderr_log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- 
https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +s:name: "deepTools - heatmap for scores associated with genomic regions" +label: "deepTools - heatmap for scores associated with genomic regions" +s:alternateName: "Plots heatmap for scores around centers of provided regions" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/heatmap-deeptools.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + deepTools - heatmap for scores associated with genomic regions + ====================================================== + + Plots heatmap for scores around centers of provided regions \ No newline at end of file From 9e115d5a278804f52a0939328d57fbdcfb807125 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 2 May 2023 18:38:41 -0400 Subject: [PATCH 021/162] Set default ~ 1 reduced formula to make DESeq run LRT --- workflows/deseq-multi-factor.cwl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git 
a/workflows/deseq-multi-factor.cwl b/workflows/deseq-multi-factor.cwl index fe3f4cd0..7365552c 100644 --- a/workflows/deseq-multi-factor.cwl +++ b/workflows/deseq-multi-factor.cwl @@ -95,11 +95,13 @@ inputs: reduced_formula: type: string? - label: "Reduced formula. If provided, use LRT instead of Wald." + default: "~ 1" + label: "Reduced formula. If removed, force to use Wald instead of LRT." doc: | Reduced formula with the term(s) of interest removed. - Should start with ~. If provided, force DESeq2 to run - LRT test instead of the Wald. + Should start with ~. If design formula includes only + one criteria, use ~ 1. If removed, forces DESeq2 to run + Wald test instead of the LRT. contrast: type: string? From e7a614237ce5a47938edc9b6f9b5f7b92e0cada8 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 2 May 2023 18:40:05 -0400 Subject: [PATCH 022/162] Remove two workflows that will be included in the separate PR --- workflows/filter-diffbind-for-heatmap.cwl | 171 ----------- workflows/heatmap-deeptools.cwl | 350 ---------------------- 2 files changed, 521 deletions(-) delete mode 100644 workflows/filter-diffbind-for-heatmap.cwl delete mode 100644 workflows/heatmap-deeptools.cwl diff --git a/workflows/filter-diffbind-for-heatmap.cwl b/workflows/filter-diffbind-for-heatmap.cwl deleted file mode 100644 index ab611e45..00000000 --- a/workflows/filter-diffbind-for-heatmap.cwl +++ /dev/null @@ -1,171 +0,0 @@ -cwlVersion: v1.0 -class: Workflow - - -requirements: - - class: SubworkflowFeatureRequirement - - class: StepInputExpressionRequirement - - class: InlineJavascriptRequirement - - class: MultipleInputFeatureRequirement - - -'sd:upstream': - sample_to_filter: - - "diffbind.cwl" - - -inputs: - - alias: - type: string - label: "Experiment short name/alias" - sd:preview: - position: 1 - - feature_file: - type: File - format: "http://edamontology.org/format_3475" - label: "Differential Binding Analysis experiment" - doc: "Differential binding analysis results 
exported as TSV" - 'sd:upstreamSource': "sample_to_filter/diffbind_report_file" - 'sd:localLabel': true - - sql_query: - type: string - label: "Filtering parameters" - doc: "Filtering parameters (WHERE parameters for SQL query)" - 'sd:filtering': - params: - columns: ["Refseq_id", "Gene_id", "txStart", "txEnd", "Strand", "Region", "Chr", "Start", "End", "Conc", "Conc1", "Conc2", "Fold", "p-value", "FDR", "Called1", "Called2"] - types: ["string", "string", "number", "number", "string", "string", "string", "number", "number", "number", "number", "number", "number", "number", "number","number", "number"] - - header: - type: boolean? - default: false - label: "Include header line" - doc: "Print header line in the output file" - 'sd:layout': - advanced: true - - columns: - type: - - "null" - - string[] - default: ["Chr", "Start", "End"] - label: "Columns to print" - doc: | - List of columns to print (SELECT parameters for SQL query). - 'sd:layout': - advanced: true - - -outputs: - - filtered_file: - type: File - format: "http://edamontology.org/format_3003" - label: "Filtered regions" - doc: "Filtered regions of interest by default formatted as headerless BED file with [Chr Start End]" - outputSource: feature_select/filtered_file - 'sd:visualPlugins': - - syncfusiongrid: - tab: 'Filtering results' - Title: 'Filtered table' - - filtering_stdout_log: - type: File - format: "http://edamontology.org/format_2330" - label: "Filtering stdout log" - doc: "Filtering stdout log" - outputSource: feature_select/stdout_log - - filtering_stderr_log: - type: File - format: "http://edamontology.org/format_2330" - label: "Filtering stderr log" - doc: "Filtering stderr log" - outputSource: feature_select/stderr_log - - -steps: - - rename_header: - run: ../tools/custom-bash.cwl - in: - input_file: feature_file - script: - default: | - echo "Replacing header to include Conc1 and Conc2 instead of Conc_[group1] and Conc_[group2]" - cat "$0" | grep -v "Refseq_id" | cut -f 1-17 > 
headerless_report.tsv - echo -e "Refseq_id\tGene_id\ttxStart\ttxEnd\tStrand\tRegion\tChr\tStart\tEnd\tConc\tConc1\tConc2\tFold\tp-value\tFDR\tCalled1\tCalled2" > `basename $0` - cat headerless_report.tsv >> `basename $0` - rm -f headerless_report.tsv - head `basename $0` - out: - - output_file - - feature_select: - run: ../tools/feature-select-sql.cwl - in: - feature_file: rename_header/output_file - sql_query: sql_query - columns: - source: columns - valueFrom: $("DISTINCT " + self.join(", ")) # multiple peaks can have the same coordinates but different abssummit, so we need to use DISTINCT - header: header - out: - - filtered_file - - stdout_log - - stderr_log - - -$namespaces: - s: http://schema.org/ - -$schemas: -- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - -s:name: "Filter DiffBind results for deepTools heatmap analysis" -label: "Filter DiffBind results for deepTools heatmap analysis" -s:alternateName: "Filter differentially bound sites from DiffBind analysis to be used with deepTools heatmap analysis" - -s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/filter-diffbind-for-heatmap.cwl -s:codeRepository: https://github.com/Barski-lab/workflows-datirium -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Cincinnati Children's Hospital Medical Center" - s:location: - - class: s:PostalAddress - s:addressCountry: "USA" - s:addressLocality: "Cincinnati" - s:addressRegion: "OH" - s:postalCode: "45229" - s:streetAddress: "3333 Burnet Ave" - s:telephone: "+1(513)636-4200" - s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" - s:department: - - class: s:Organization - s:legalName: "Allergy and Immunology" - s:department: - - class: s:Organization - 
s:legalName: "Barski Research Lab" - s:member: - - class: s:Person - s:name: Michael Kotliar - s:email: mailto:misha.kotliar@gmail.com - s:sameAs: - - id: http://orcid.org/0000-0002-6486-3898 - - -doc: | - Filter DiffBind results for deepTools heatmap analysis - ====================================================== - - Filter differentially bound sites from DiffBind analysis to be used with deepTools heatmap analysis diff --git a/workflows/heatmap-deeptools.cwl b/workflows/heatmap-deeptools.cwl deleted file mode 100644 index 9fcedfb5..00000000 --- a/workflows/heatmap-deeptools.cwl +++ /dev/null @@ -1,350 +0,0 @@ -cwlVersion: v1.0 -class: Workflow - - -requirements: - - class: StepInputExpressionRequirement - - class: InlineJavascriptRequirement - - class: MultipleInputFeatureRequirement - - class: SubworkflowFeatureRequirement - - -'sd:upstream': - filtered_regions_sample: - - "filter-diffbind-for-heatmap.cwl" - chipseq_sample: - - "trim-chipseq-se.cwl" - - "trim-chipseq-pe.cwl" - - "trim-atacseq-se.cwl" - - "trim-atacseq-pe.cwl" - - -inputs: - - alias: - type: string - label: "Experiment short name/alias" - sd:preview: - position: 1 - - scores_files: - type: File[] - format: "http://edamontology.org/format_3006" - label: "ChIP/ATAC-Seq sample(s)" - doc: "bigWig file from ChIP/ATAC-Seq sample(s)" - 'sd:upstreamSource': "chipseq_sample/bigwig" - 'sd:localLabel': true - - scores_labels: - type: string[] - label: "ChIP/ATAC-Seq sample(s)" - doc: "Aliases for ChIP/ATAC-Seq sample(s)" - 'sd:upstreamSource': "chipseq_sample/alias" - - regions_files: - type: File[] - format: "http://edamontology.org/format_3003" - label: "Filtered DiffBind results sample(s)" - doc: "File(s) generated by Filter DiffBind results for deepTools heatmap analysis pipeline" - 'sd:upstreamSource': "filtered_regions_sample/filtered_file" - 'sd:localLabel': true - - regions_labels: - type: string[] - label: "Filtered DiffBind results sample(s)" - doc: "Aliases for file(s) generated by Filter 
DiffBind results for deepTools heatmap analysis pipeline" - 'sd:upstreamSource': "filtered_regions_sample/alias" - 'sd:localLabel': true - - before_region_start_length: - type: int? - default: 5000 - label: "Distance upstream of the reference-point selected" - doc: | - Distance upstream of the reference-point selected. - 'sd:layout': - advanced: true - - after_region_start_length: - type: int? - default: 5000 - label: "Distance downstream of the reference-point selected" - doc: | - Distance downstream of the reference-point selected. - 'sd:layout': - advanced: true - - bin_size: - type: int? - default: 10 - label: "Length, in bases, of the non-overlapping bins for averaging the score over the regions length" - doc: | - Length, in bases, of the non-overlapping bins for averaging the score over - the regions length. - 'sd:layout': - advanced: true - - plot_type: - type: - - "null" - - type: enum - symbols: - - "lines" - - "fill" - - "se" - - "std" - default: "lines" - label: "Plot type to display" - doc: | - “lines” will plot the profile line based on the average type selected. - “fill” fills the region between zero and the profile curve. The fill in - color is semi transparent to distinguish different profiles. “se” and - “std” color the region between the profile and the standard error or - standard deviation of the data. - 'sd:layout': - advanced: true - - sort_regions: - type: - - "null" - - type: enum - symbols: - - "descend" - - "ascend" - - "no" - - "keep" - default: "descend" - label: "Sorting order for regions" - doc: | - Whether the heatmap should present the regions sorted. The default is to sort in - descending order based on the mean value per region. Note that “keep” and “no” are - the same thing. 
- 'sd:layout': - advanced: true - - what_to_show: - type: - - "null" - - type: enum - symbols: - - plot, heatmap and colorbar - - plot and heatmap - - heatmap only - - heatmap and colorbar - default: "plot, heatmap and colorbar" - label: "What show on the plot" - doc: | - The default is to include a summary or profile plot on top of the heatmap and a heatmap colorbar. - Other options are: “plot and heatmap”, “heatmap only”, “heatmap and colorbar”, and the default “plot, - heatmap and colorbar”. - 'sd:layout': - advanced: true - - per_group: - type: boolean? - default: false - label: "Plot all samples by group of regions instead of groups of regions by sample" - doc: | - The default is to plot all groups of regions by sample. Using this option instead plots all - samples by group of regions. Note that this is only useful if you have multiple groups of - regions by sample rather than group. - 'sd:layout': - advanced: true - - threads: - type: int? - default: 1 - label: "Number of threads" - doc: "Number of threads for those steps that support multithreading" - 'sd:layout': - advanced: true - - -outputs: - - scores_matrix: - type: File - outputSource: compute_score_matrix/scores_matrix - label: "Scores per genome regions matrix" - doc: | - Scores per genome regions matrix. This file that can be used - with plotHeatmap and plotProfiles. - - heatmap_png: - type: File - outputSource: make_heatmap/heatmap_file - label: "Heatmap for scores around centers of provided regions" - doc: | - Heatmap for scores around centers of provided regions. 
- PNG format - 'sd:visualPlugins': - - image: - tab: 'Plots' - Caption: 'Heatmap for scores around centers of provided regions' - - compute_score_matrix_stdout_log: - type: File - outputSource: compute_score_matrix/stdout_log - label: "compute_score_matrix stdout log" - doc: | - compute_score_matrix stdout log - - compute_score_matrix_stderr_log: - type: File - outputSource: compute_score_matrix/stderr_log - label: "compute_score_matrix stderr log" - doc: | - compute_score_matrix stderr log - - make_heatmap_stdout_log: - type: File - outputSource: make_heatmap/stdout_log - label: "make_heatmap stdout log" - doc: | - make_heatmap stdout log - - make_heatmap_stderr_log: - type: File - outputSource: make_heatmap/stderr_log - label: "make_heatmap stderr log" - doc: | - make_heatmap stderr log - - -steps: - - recenter_regions: - run: - cwlVersion: v1.0 - class: Workflow - requirements: - - class: ScatterFeatureRequirement - inputs: - regions_files: - type: File[] - outputs: - recentered_regions_files: - type: File[] - outputSource: recenter/output_file - steps: - recenter: - run: ../tools/custom-bash.cwl - in: - input_file: regions_files - script: - default: | - # chrom start end - echo "Recenter by the peak center" - cat "$0" | tr -d "\r" | tr "," "\t" | awk NF | sort -u -k1,1 -k2,2n -k3,3n | awk '{center=$2+int(($3-$2)/2); print $1"\t"center"\t"center+1}' > "${RANDOM}_"`basename $0` - scatter: input_file - out: - - output_file - in: - regions_files: regions_files - out: - - recentered_regions_files - - compute_score_matrix: - run: ../tools/deeptools-computematrix-referencepoint.cwl - in: - score_files: scores_files - regions_files: recenter_regions/recentered_regions_files - reference_point: - default: "TSS" # doesn't matter what we set here because we centered regions ourlselves - before_region_start_length: before_region_start_length - after_region_start_length: after_region_start_length - bin_size: bin_size - sort_regions: sort_regions - samples_label: 
scores_labels - output_filename: - default: "score_matrix.gz" - missing_data_as_zero: - default: true - threads: threads - out: - - scores_matrix - - stdout_log - - stderr_log - - make_heatmap: - run: ../tools/deeptools-plotheatmap.cwl - in: - plot_title: - default: "Tag density around peak centers" - scores_matrix: compute_score_matrix/scores_matrix - output_filename: - default: "score_heatmap.png" - plot_type: plot_type - sort_regions: sort_regions - average_type_summary_plot: - default: "mean" - what_to_show: what_to_show - ref_point_label: - default: "Peak Center" - regions_label: regions_labels - samples_label: scores_labels - x_axis_label: - default: "distance (bp)" - y_axisLabel: - default: "Signal mean" - per_group: per_group - plot_file_format: - default: "png" - legend_location: - default: "upper-left" - out: - - heatmap_file - - stdout_log - - stderr_log - - -$namespaces: - s: http://schema.org/ - -$schemas: -- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - -s:name: "deepTools - heatmap for scores associated with genomic regions" -label: "deepTools - heatmap for scores associated with genomic regions" -s:alternateName: "Plots heatmap for scores around centers of provided regions" - -s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/heatmap-deeptools.cwl -s:codeRepository: https://github.com/Barski-lab/workflows-datirium -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Cincinnati Children's Hospital Medical Center" - s:location: - - class: s:PostalAddress - s:addressCountry: "USA" - s:addressLocality: "Cincinnati" - s:addressRegion: "OH" - s:postalCode: "45229" - s:streetAddress: "3333 Burnet Ave" - s:telephone: "+1(513)636-4200" - s:logo: 
"https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" - s:department: - - class: s:Organization - s:legalName: "Allergy and Immunology" - s:department: - - class: s:Organization - s:legalName: "Barski Research Lab" - s:member: - - class: s:Person - s:name: Michael Kotliar - s:email: mailto:misha.kotliar@gmail.com - s:sameAs: - - id: http://orcid.org/0000-0002-6486-3898 - - -doc: | - deepTools - heatmap for scores associated with genomic regions - ====================================================== - - Plots heatmap for scores around centers of provided regions \ No newline at end of file From 24072a788331085f47a2d6f2485f98a1743b20b9 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 3 May 2023 15:27:32 -0400 Subject: [PATCH 023/162] Add Fastq-Dump workflow --- tools/fastq-dump.cwl | 264 +++++++++++++++++++++++++++++++++++ workflows/fastq-download.cwl | 215 ++++++++++++++++++++++++++++ 2 files changed, 479 insertions(+) create mode 100644 tools/fastq-dump.cwl create mode 100644 workflows/fastq-download.cwl diff --git a/tools/fastq-dump.cwl b/tools/fastq-dump.cwl new file mode 100644 index 00000000..89ba1a20 --- /dev/null +++ b/tools/fastq-dump.cwl @@ -0,0 +1,264 @@ +cwlVersion: v1.0 +class: CommandLineTool + +requirements: +- class: InlineJavascriptRequirement + +hints: +- class: DockerRequirement + dockerPull: pegi3s/sratoolkit:3.0.1 + + +inputs: + + srr_id: + type: string + inputBinding: + position: 60 + doc: | + SRR identifier + + split_files: + type: boolean? + inputBinding: + position: 10 + prefix: "--split-files" + doc: | + Write reads into separate files. Read + number will be suffixed to the file name. + NOTE! The `--split-3` option is recommended. + In cases where not all spots have the same + number of reads, this option will produce + files that WILL CAUSE ERRORS in most programs + which process split pair fastq files. + + split_3: + type: boolean? 
+ inputBinding: + position: 11 + prefix: "--split-3" + doc: | + 3-way splitting for mate-pairs. For each + spot, if there are two biological reads + satisfying filter conditions, the first is + placed in the `*_1.fastq` file, and the + second is placed in the `*_2.fastq` file. If + there is only one biological read + satisfying the filter conditions, it is + placed in the `*.fastq` file.All other + reads in the spot are ignored. + + +outputs: + + fastq_files: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*.gz" + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["fastq-dump", "--gzip", "--log-level", "info"] + +stdout: fastq_dump_stdout.log +stderr: fastq_dump_stderr.log + +successCodes: [1, 3] + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Fastq-Dump" +s:name: "Fastq-Dump" +s:alternateName: "Downloads FASTQ files from the provided SRR identifier" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/fastq-dump.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: 
Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Fastq-Dump + + Downloads FASTQ files from the provided SRR identifier + + +s:about: | + Usage: + fastq-dump [options] [...] + fastq-dump [options] + + INPUT + -A|--accession Replaces accession derived from in + filename(s) and deflines (only for single + table dump) + --table Table name within cSRA object, default is + "SEQUENCE" + + PROCESSING + + Read Splitting Sequence data may be used in raw form or + split into individual reads + --split-spot Split spots into individual reads + + Full Spot Filters Applied to the full spot independently + of --split-spot + -N|--minSpotId Minimum spot id + -X|--maxSpotId Maximum spot id + --spot-groups <[list]> Filter by SPOT_GROUP (member): name[,...] + -W|--clip Remove adapter sequences from reads + + Common Filters Applied to spots when --split-spot is not + set, otherwise - to individual reads + -M|--minReadLen Filter by sequence length >= + -R|--read-filter <[filter]> Split into files by READ_FILTER value + optionally filter by value: + pass|reject|criteria|redacted + -E|--qual-filter Filter used in early 1000 Genomes data: no + sequences starting or ending with >= 10N + --qual-filter-1 Filter used in current 1000 Genomes data + + Filters based on alignments Filters are active when alignment + data are present + --aligned Dump only aligned sequences + --unaligned Dump only unaligned sequences + --aligned-region Filter by position on genome. Name can + either be accession.version (ex: + NC_000001.10) or file specific name (ex: + "chr1" or "1"). "from" and "to" are 1-based + coordinates + --matepair-distance Filter by distance between matepairs. + Use "unknown" to find matepairs split + between the references. 
Use from-to to limit + matepair distance on the same reference + + Filters for individual reads Applied only with --split-spot set + --skip-technical Dump only biological reads + + OUTPUT + -O|--outdir Output directory, default is working + directory '.' ) + -Z|--stdout Output to stdout, all split data become + joined into single stream + --gzip Compress output using gzip: deprecated, not + recommended + --bzip2 Compress output using bzip2: deprecated, + not recommended + + Multiple File Options Setting these options will produce more + than 1 file, each of which will be suffixed + according to splitting criteria. + --split-files Write reads into separate files. Read + number will be suffixed to the file name. + NOTE! The `--split-3` option is recommended. + In cases where not all spots have the same + number of reads, this option will produce + files that WILL CAUSE ERRORS in most programs + which process split pair fastq files. + --split-3 3-way splitting for mate-pairs. For each + spot, if there are two biological reads + satisfying filter conditions, the first is + placed in the `*_1.fastq` file, and the + second is placed in the `*_2.fastq` file. If + there is only one biological read + satisfying the filter conditions, it is + placed in the `*.fastq` file.All other + reads in the spot are ignored. + -G|--spot-group Split into files by SPOT_GROUP (member name) + -R|--read-filter <[filter]> Split into files by READ_FILTER value + optionally filter by value: + pass|reject|criteria|redacted + -T|--group-in-dirs Split into subdirectories instead of files + -K|--keep-empty-files Do not delete empty files + + FORMATTING + + Sequence + -C|--dumpcs <[cskey]> Formats sequence using color space (default + for SOLiD),"cskey" may be specified for + translation + -B|--dumpbase Formats sequence using base space (default + for other than SOLiD). 
+ + Quality + -Q|--offset Offset to use for quality conversion, + default is 33 + --fasta <[line width]> FASTA only, no qualities, optional line + wrap width (set to zero for no wrapping) + --suppress-qual-for-cskey suppress quality-value for cskey + + Defline + -F|--origfmt Defline contains only original sequence name + -I|--readids Append read id after spot id as + 'accession.spot.readid' on defline + --helicos Helicos style defline + --defline-seq Defline format specification for sequence. + --defline-qual Defline format specification for quality. + is string of characters and/or + variables. The variables can be one of: $ac + - accession, $si spot id, $sn spot + name, $sg spot group (barcode), $sl spot + length in bases, $ri read number, $rn + read name, $rl read length in bases. '[]' + could be used for an optional output: if + all vars in [] yield empty values whole + group is not printed. Empty value is empty + string or for numeric variables. Ex: + @$sn[_$rn]/$ri '_$rn' is omitted if name + is empty + + OTHER: + --ngc to ngc file + --disable-multithreading disable multithreading + -h|--help Output brief explanation of program usage + -V|--version Display the version of the program + -L|--log-level Logging level as number or enum string One + of (fatal|sys|int|err|warn|info) or (0-5) + Current/default is warn + -v|--verbose Increase the verbosity level of the program + Use multiple times for more verbosity + --ncbi_error_report Control program execution environment + report generation (if implemented). One of + (never|error|always). 
Default is error + --legacy-report use legacy style 'Written spots' for tool \ No newline at end of file diff --git a/workflows/fastq-download.cwl b/workflows/fastq-download.cwl new file mode 100644 index 00000000..44b1c98b --- /dev/null +++ b/workflows/fastq-download.cwl @@ -0,0 +1,215 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: +- class: SubworkflowFeatureRequirement +- class: StepInputExpressionRequirement +- class: InlineJavascriptRequirement +- class: MultipleInputFeatureRequirement + + +inputs: + + alias: + type: string + label: "Experiment short name/Alias" + sd:preview: + position: 1 + + srr_id: + type: string + label: "SRR Identifier" + doc: | + Single SRR Identifier + + splitby: + type: + - "null" + - type: enum + symbols: + - "Split into all available files" + - "3-way splitting for mate-pairs" + - "Do not split" + default: "3-way splitting for mate-pairs" + label: "Split reads by" + doc: + Split into all available files. + Write reads into separate files. + Read number will be suffixed to + the file name. In cases where not + all spots have the same number of + reads, this option will produce + files that WILL CAUSE ERRORS in + most programs which process split + pair fastq files. + + 3-way splitting for mate-pairs. + For each spot, if there are two + biological reads satisfying filter + conditions, the first is placed in + the `*_1.fastq` file, and the second + is placed in the `*_2.fastq` file. + If there is only one biological read + satisfying the filter conditions, it + is placed in the `*.fastq` file. All + other reads in the spot are ignored. + + Do not split. 
+ Output all reads into as a single + FASTQ file + + +outputs: + + fastq_files: + type: + - "null" + - type: array + items: File + outputSource: fastq_dump/fastq_files + label: "Gzip-compressed FASTQ files" + doc: | + Gzip-compressed FASTQ files + + report_md: + type: File + outputSource: collect_report/output_file + label: "Collected report for downloaded FASTQ files" + doc: | + Collected report for downloaded FASTQ files + in Markdown format + 'sd:visualPlugins': + - markdownView: + tab: 'Overview' + + fastq_dump_stdout_log: + type: File + outputSource: fastq_dump/stdout_log + label: "stdout log generated by fastq_dump" + doc: | + stdout log generated by fastq_dump + + fastq_dump_stderr_log: + type: File + outputSource: fastq_dump/stderr_log + label: "stderr log generated by fastq_dump" + doc: | + stderr log generated by fastq_dump + + +steps: + + fastq_dump: + run: ../tools/fastq-dump.cwl + in: + srr_id: srr_id + split_files: + source: splitby + valueFrom: $(self=="Split into all available files"?true:null) + split_3: + source: splitby + valueFrom: $(self=="3-way splitting for mate-pairs"?true:null) + out: + - fastq_files + - stdout_log + - stderr_log + + collect_report: + run: + cwlVersion: v1.0 + class: CommandLineTool + hints: + - class: DockerRequirement + dockerPull: biowardrobe2/scidap:v0.0.3 + inputs: + script: + type: string? + default: | + #!/bin/bash + set -- "$0" "$@" + if [ "$#" -eq 1 ] && [ "$0" = "/bin/bash" ]; then + echo "Failed to download FASTQ files. Check logs for errors." 
> report.md + exit 0 + fi + echo "## Collected Report" > report.md + j=1 + for i in "${@}"; do + echo "### `basename $i`" >> report.md + echo "**`zcat $i | wc -l`** lines, **`stat -c%s $i`** bytes" >> report.md + echo "Top 5 reads" >> report.md + echo "\`\`\`" >> report.md + echo "`zcat $i | head -n 20`" >> report.md + echo "\`\`\`" >> report.md + (( j++ )) + done; + inputBinding: + position: 1 + input_file: + type: + - "null" + - type: array + items: File + inputBinding: + position: 2 + outputs: + output_file: + type: File + outputBinding: + glob: "*" + baseCommand: [bash, '-c'] + in: + input_file: fastq_dump/fastq_files + out: + - output_file + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "FASTQ Download" +s:name: "FASTQ Download" +s:alternateName: "Download FASTQ files using fastq-dump from SRA Toolkit" + +s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/fastq-download.cwl +s:codeRepository: https://github.com/datirium/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: 
http://orcid.org/0000-0002-6486-3898 + + +doc: | + FASTQ Download + + Download FASTQ files using fastq-dump from SRA Toolkit From 1a29f8c070ee741bc2852de14c2a6f0952a1028e Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 3 May 2023 16:00:11 -0400 Subject: [PATCH 024/162] Added optional proxy settings to fastq-download workflow --- tools/fastq-dump.cwl | 14 ++++++++++++++ workflows/fastq-download.cwl | 22 ++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/tools/fastq-dump.cwl b/tools/fastq-dump.cwl index 89ba1a20..a0e61d5f 100644 --- a/tools/fastq-dump.cwl +++ b/tools/fastq-dump.cwl @@ -3,6 +3,11 @@ class: CommandLineTool requirements: - class: InlineJavascriptRequirement +- class: EnvVarRequirement + envDef: + http_proxy: $(inputs.http_proxy) + https_proxy: $(inputs.https_proxy) + hints: - class: DockerRequirement @@ -48,6 +53,15 @@ inputs: placed in the `*.fastq` file.All other reads in the spot are ignored. + http_proxy: + type: string? + doc: | + Optional HTTP proxy settings + + https_proxy: + type: string? + doc: | + Optional HTTPS proxy settings outputs: diff --git a/workflows/fastq-download.cwl b/workflows/fastq-download.cwl index 44b1c98b..9b4fd0ba 100644 --- a/workflows/fastq-download.cwl +++ b/workflows/fastq-download.cwl @@ -59,6 +59,22 @@ inputs: Output all reads into as a single FASTQ file + http_proxy: + type: string? + label: "Optional HTTP proxy settings" + doc: | + Optional HTTP proxy settings + 'sd:layout': + advanced: true + + https_proxy: + type: string? 
+ label: "Optional HTTPS proxy settings" + doc: | + Optional HTTPS proxy settings + 'sd:layout': + advanced: true + outputs: @@ -110,6 +126,12 @@ steps: split_3: source: splitby valueFrom: $(self=="3-way splitting for mate-pairs"?true:null) + http_proxy: + source: http_proxy + valueFrom: $(self==""?null:self) # safety measure + https_proxy: + source: https_proxy + valueFrom: $(self==""?null:self) # safety measure out: - fastq_files - stdout_log From 0a79960ea802f248bcaf7e3898e39f4a4fad4303 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 3 May 2023 16:02:57 -0400 Subject: [PATCH 025/162] Updated markdown header size --- workflows/fastq-download.cwl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/fastq-download.cwl b/workflows/fastq-download.cwl index 9b4fd0ba..563134d4 100644 --- a/workflows/fastq-download.cwl +++ b/workflows/fastq-download.cwl @@ -154,10 +154,10 @@ steps: echo "Failed to download FASTQ files. Check logs for errors." > report.md exit 0 fi - echo "## Collected Report" > report.md + echo "### Collected Report" > report.md j=1 for i in "${@}"; do - echo "### `basename $i`" >> report.md + echo "#### `basename $i`" >> report.md echo "**`zcat $i | wc -l`** lines, **`stat -c%s $i`** bytes" >> report.md echo "Top 5 reads" >> report.md echo "\`\`\`" >> report.md From 1bb00442e813a33a64d607302e9e333177fc7c22 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 3 May 2023 16:24:34 -0400 Subject: [PATCH 026/162] Not important changes --- workflows/fastq-download.cwl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflows/fastq-download.cwl b/workflows/fastq-download.cwl index 563134d4..19dd4041 100644 --- a/workflows/fastq-download.cwl +++ b/workflows/fastq-download.cwl @@ -158,8 +158,7 @@ steps: j=1 for i in "${@}"; do echo "#### `basename $i`" >> report.md - echo "**`zcat $i | wc -l`** lines, **`stat -c%s $i`** bytes" >> report.md - echo "Top 5 reads" >> report.md + echo "**`zcat $i | wc 
-l`** lines, **`stat -c%s $i`** bytes, top **5** reads" >> report.md echo "\`\`\`" >> report.md echo "`zcat $i | head -n 20`" >> report.md echo "\`\`\`" >> report.md From 08d12b445195dcfcce256e1cb512272a18ca2b17 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 4 May 2023 19:18:57 -0400 Subject: [PATCH 027/162] Update fastq download pipeline to support multiple SRR identifiers --- tools/fastq-dump.cwl | 72 +++++++++++++++++++++++++++++++++--- workflows/fastq-download.cwl | 68 ++++++++-------------------------- 2 files changed, 82 insertions(+), 58 deletions(-) diff --git a/tools/fastq-dump.cwl b/tools/fastq-dump.cwl index a0e61d5f..aa9502f4 100644 --- a/tools/fastq-dump.cwl +++ b/tools/fastq-dump.cwl @@ -5,9 +5,8 @@ requirements: - class: InlineJavascriptRequirement - class: EnvVarRequirement envDef: - http_proxy: $(inputs.http_proxy) - https_proxy: $(inputs.https_proxy) - + http_proxy: $(inputs.http_proxy?inputs.http_proxy:"") + https_proxy: $(inputs.https_proxy?inputs.https_proxy:"") hints: - class: DockerRequirement @@ -16,12 +15,67 @@ hints: inputs: + script: + type: string? 
+ default: | + #!/bin/bash + set -- "$0" "$@" + + SRA_IDS=() + PARAMS=() + + for i in "$@"; do + if [[ "$i" = "--split-files" ]] || [[ "$i" = "--split-3" ]]; then + echo "Adding param $i" + PARAMS+=($i) + else + echo "Adding SRR $i" + SRA_IDS+=($i) + fi + done; + + echo "### Single files statistics" > single.md + echo "### Merged files statistics" > merged.md + + for SRA in ${SRA_IDS[@]}; do + echo "Downloading $SRA with ${PARAMS[@]}" + fastq-dump --gzip --log-level info ${PARAMS[@]} $SRA + j=1 + for FASTQ in $SRA*.gz; do + echo "#### `basename $FASTQ`" >> single.md + echo "**`zcat $FASTQ | wc -l`** lines, **`stat -c%s $FASTQ`** bytes, top **5** reads" >> single.md + echo "\`\`\`" >> single.md + echo "`zcat $FASTQ | head -n 20`" >> single.md + echo "\`\`\`" >> single.md + echo "Adding $FASTQ to read_$j.fastq.gz" + cat $FASTQ >> read_$j.fastq.gz + rm -f $FASTQ + (( j++ )) + done; + done; + + for MERGED in read*.gz; do + echo "#### `basename $MERGED`" >> merged.md + echo "**`zcat $MERGED | wc -l`** lines, **`stat -c%s $MERGED`** bytes, top **5** reads" >> merged.md + done; + + cat merged.md single.md > report.md + rm -f merged.md single.md + + inputBinding: + position: 1 + doc: | + Bash function to run refgene-sort and atdp + srr_id: - type: string + type: + - string + - type: array + items: string inputBinding: position: 60 doc: | - SRR identifier + SRR identifiers split_files: type: boolean? 
@@ -73,6 +127,11 @@ outputs: outputBinding: glob: "*.gz" + report_md: + type: File + outputBinding: + glob: "report.md" + stdout_log: type: stdout @@ -80,13 +139,14 @@ outputs: type: stderr -baseCommand: ["fastq-dump", "--gzip", "--log-level", "info"] +baseCommand: ["bash", "-c"] stdout: fastq_dump_stdout.log stderr: fastq_dump_stderr.log successCodes: [1, 3] + $namespaces: s: http://schema.org/ diff --git a/workflows/fastq-download.cwl b/workflows/fastq-download.cwl index 19dd4041..3cce6e0f 100644 --- a/workflows/fastq-download.cwl +++ b/workflows/fastq-download.cwl @@ -5,8 +5,16 @@ class: Workflow requirements: - class: SubworkflowFeatureRequirement - class: StepInputExpressionRequirement -- class: InlineJavascriptRequirement - class: MultipleInputFeatureRequirement +- class: InlineJavascriptRequirement + expressionLib: + - var split_features = function(line) { + function get_unique(value, index, self) { + return self.indexOf(value) === index && value != ""; + } + var splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; inputs: @@ -19,9 +27,9 @@ inputs: srr_id: type: string - label: "SRR Identifier" + label: "Comma or space separated list of SRR Identifiers" doc: | - Single SRR Identifier + Comma or space separated list of SRR Identifiers splitby: type: @@ -90,7 +98,7 @@ outputs: report_md: type: File - outputSource: collect_report/output_file + outputSource: fastq_dump/report_md label: "Collected report for downloaded FASTQ files" doc: | Collected report for downloaded FASTQ files @@ -119,7 +127,9 @@ steps: fastq_dump: run: ../tools/fastq-dump.cwl in: - srr_id: srr_id + srr_id: + source: srr_id + valueFrom: $(split_features(self)) split_files: source: splitby valueFrom: $(self=="Split into all available files"?true:null) @@ -134,56 +144,10 @@ steps: valueFrom: $(self==""?null:self) # safety measure out: - fastq_files + - report_md - stdout_log - stderr_log - collect_report: 
- run: - cwlVersion: v1.0 - class: CommandLineTool - hints: - - class: DockerRequirement - dockerPull: biowardrobe2/scidap:v0.0.3 - inputs: - script: - type: string? - default: | - #!/bin/bash - set -- "$0" "$@" - if [ "$#" -eq 1 ] && [ "$0" = "/bin/bash" ]; then - echo "Failed to download FASTQ files. Check logs for errors." > report.md - exit 0 - fi - echo "### Collected Report" > report.md - j=1 - for i in "${@}"; do - echo "#### `basename $i`" >> report.md - echo "**`zcat $i | wc -l`** lines, **`stat -c%s $i`** bytes, top **5** reads" >> report.md - echo "\`\`\`" >> report.md - echo "`zcat $i | head -n 20`" >> report.md - echo "\`\`\`" >> report.md - (( j++ )) - done; - inputBinding: - position: 1 - input_file: - type: - - "null" - - type: array - items: File - inputBinding: - position: 2 - outputs: - output_file: - type: File - outputBinding: - glob: "*" - baseCommand: [bash, '-c'] - in: - input_file: fastq_dump/fastq_files - out: - - output_file - $namespaces: s: http://schema.org/ From c50bfccdbcb8caa015eb21ffac08368f0b381af7 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 4 May 2023 19:36:27 -0400 Subject: [PATCH 028/162] Not important changes --- tools/fastq-dump.cwl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/fastq-dump.cwl b/tools/fastq-dump.cwl index aa9502f4..b551342c 100644 --- a/tools/fastq-dump.cwl +++ b/tools/fastq-dump.cwl @@ -56,7 +56,7 @@ inputs: for MERGED in read*.gz; do echo "#### `basename $MERGED`" >> merged.md - echo "**`zcat $MERGED | wc -l`** lines, **`stat -c%s $MERGED`** bytes, top **5** reads" >> merged.md + echo "**`zcat $MERGED | wc -l`** lines, **`stat -c%s $MERGED`** bytes" >> merged.md done; cat merged.md single.md > report.md @@ -335,4 +335,4 @@ s:about: | --ncbi_error_report Control program execution environment report generation (if implemented). One of (never|error|always). 
Default is error - --legacy-report use legacy style 'Written spots' for tool \ No newline at end of file + --legacy-report use legacy style 'Written spots' for tool From 3a597c4dd95529cd7b1a73edd0f7601723648e56 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 10 May 2023 13:45:17 -0400 Subject: [PATCH 029/162] Update Fastq Download Workflow --- tools/fastq-dump.cwl | 296 ++++++++++++----------------------- workflows/fastq-download.cwl | 86 +++++++++- 2 files changed, 178 insertions(+), 204 deletions(-) diff --git a/tools/fastq-dump.cwl b/tools/fastq-dump.cwl index b551342c..351bca4e 100644 --- a/tools/fastq-dump.cwl +++ b/tools/fastq-dump.cwl @@ -10,63 +10,11 @@ requirements: hints: - class: DockerRequirement - dockerPull: pegi3s/sratoolkit:3.0.1 + dockerPull: biowardrobe2/fastqdwnld:v0.0.1 inputs: - script: - type: string? - default: | - #!/bin/bash - set -- "$0" "$@" - - SRA_IDS=() - PARAMS=() - - for i in "$@"; do - if [[ "$i" = "--split-files" ]] || [[ "$i" = "--split-3" ]]; then - echo "Adding param $i" - PARAMS+=($i) - else - echo "Adding SRR $i" - SRA_IDS+=($i) - fi - done; - - echo "### Single files statistics" > single.md - echo "### Merged files statistics" > merged.md - - for SRA in ${SRA_IDS[@]}; do - echo "Downloading $SRA with ${PARAMS[@]}" - fastq-dump --gzip --log-level info ${PARAMS[@]} $SRA - j=1 - for FASTQ in $SRA*.gz; do - echo "#### `basename $FASTQ`" >> single.md - echo "**`zcat $FASTQ | wc -l`** lines, **`stat -c%s $FASTQ`** bytes, top **5** reads" >> single.md - echo "\`\`\`" >> single.md - echo "`zcat $FASTQ | head -n 20`" >> single.md - echo "\`\`\`" >> single.md - echo "Adding $FASTQ to read_$j.fastq.gz" - cat $FASTQ >> read_$j.fastq.gz - rm -f $FASTQ - (( j++ )) - done; - done; - - for MERGED in read*.gz; do - echo "#### `basename $MERGED`" >> merged.md - echo "**`zcat $MERGED | wc -l`** lines, **`stat -c%s $MERGED`** bytes" >> merged.md - done; - - cat merged.md single.md > report.md - rm -f merged.md single.md - - 
inputBinding: - position: 1 - doc: | - Bash function to run refgene-sort and atdp - srr_id: type: - string @@ -127,11 +75,99 @@ outputs: outputBinding: glob: "*.gz" + metadata_xml: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*.xml" + report_md: type: File outputBinding: glob: "report.md" + collected_metadata: + type: File + outputBinding: + glob: "collected_metadata.tsv" + + run_acc: + type: + - "null" + - type: array + items: string + outputBinding: + loadContents: true + glob: "collected_metadata.tsv" + outputEval: | + ${ + var pattern = /run_acc\:.*/; + var splitted_line = self[0].contents.match(pattern)[0].trim().split(" ").slice(1); + return (!!splitted_line.length)?splitted_line:null; + } + + experiment_acc: + type: + - "null" + - type: array + items: string + outputBinding: + loadContents: true + glob: "collected_metadata.tsv" + outputEval: | + ${ + var pattern = /experiment_acc\:.*/; + var splitted_line = self[0].contents.match(pattern)[0].trim().split(" ").slice(1); + return (!!splitted_line.length)?splitted_line:null; + } + + study_acc: + type: + - "null" + - type: array + items: string + outputBinding: + loadContents: true + glob: "collected_metadata.tsv" + outputEval: | + ${ + var pattern = /study_acc\:.*/; + var splitted_line = self[0].contents.match(pattern)[0].trim().split(" ").slice(1); + return (!!splitted_line.length)?splitted_line:null; + } + + biosample: + type: + - "null" + - type: array + items: string + outputBinding: + loadContents: true + glob: "collected_metadata.tsv" + outputEval: | + ${ + var pattern = /biosample\:.*/; + var splitted_line = self[0].contents.match(pattern)[0].trim().split(" ").slice(1); + return (!!splitted_line.length)?splitted_line:null; + } + + bioproject: + type: + - "null" + - type: array + items: string + outputBinding: + loadContents: true + glob: "collected_metadata.tsv" + outputEval: | + ${ + var pattern = /bioproject\:.*/; + var splitted_line = 
self[0].contents.match(pattern)[0].trim().split(" ").slice(1); + return (!!splitted_line.length)?splitted_line:null; + } + stdout_log: type: stdout @@ -139,12 +175,10 @@ outputs: type: stderr -baseCommand: ["bash", "-c"] - -stdout: fastq_dump_stdout.log -stderr: fastq_dump_stderr.log +baseCommand: ["sra_download.sh"] -successCodes: [1, 3] +stdout: sra_download_stdout.log +stderr: sra_download_stderr.log $namespaces: @@ -153,8 +187,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Fastq-Dump" -s:name: "Fastq-Dump" +label: "Fastq-Dump on Steroids" +s:name: "Fastq-Dump on Steroids" s:alternateName: "Downloads FASTQ files from the provided SRR identifier" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/fastq-dump.cwl @@ -193,146 +227,12 @@ s:creator: doc: | - Fastq-Dump + Fastq-Dump on Steroids Downloads FASTQ files from the provided SRR identifier s:about: | - Usage: - fastq-dump [options] [...] - fastq-dump [options] - - INPUT - -A|--accession Replaces accession derived from in - filename(s) and deflines (only for single - table dump) - --table Table name within cSRA object, default is - "SEQUENCE" - - PROCESSING - - Read Splitting Sequence data may be used in raw form or - split into individual reads - --split-spot Split spots into individual reads - - Full Spot Filters Applied to the full spot independently - of --split-spot - -N|--minSpotId Minimum spot id - -X|--maxSpotId Maximum spot id - --spot-groups <[list]> Filter by SPOT_GROUP (member): name[,...] 
- -W|--clip Remove adapter sequences from reads - - Common Filters Applied to spots when --split-spot is not - set, otherwise - to individual reads - -M|--minReadLen Filter by sequence length >= - -R|--read-filter <[filter]> Split into files by READ_FILTER value - optionally filter by value: - pass|reject|criteria|redacted - -E|--qual-filter Filter used in early 1000 Genomes data: no - sequences starting or ending with >= 10N - --qual-filter-1 Filter used in current 1000 Genomes data - - Filters based on alignments Filters are active when alignment - data are present - --aligned Dump only aligned sequences - --unaligned Dump only unaligned sequences - --aligned-region Filter by position on genome. Name can - either be accession.version (ex: - NC_000001.10) or file specific name (ex: - "chr1" or "1"). "from" and "to" are 1-based - coordinates - --matepair-distance Filter by distance between matepairs. - Use "unknown" to find matepairs split - between the references. Use from-to to limit - matepair distance on the same reference - - Filters for individual reads Applied only with --split-spot set - --skip-technical Dump only biological reads - - OUTPUT - -O|--outdir Output directory, default is working - directory '.' ) - -Z|--stdout Output to stdout, all split data become - joined into single stream - --gzip Compress output using gzip: deprecated, not - recommended - --bzip2 Compress output using bzip2: deprecated, - not recommended - - Multiple File Options Setting these options will produce more - than 1 file, each of which will be suffixed - according to splitting criteria. - --split-files Write reads into separate files. Read - number will be suffixed to the file name. - NOTE! The `--split-3` option is recommended. - In cases where not all spots have the same - number of reads, this option will produce - files that WILL CAUSE ERRORS in most programs - which process split pair fastq files. - --split-3 3-way splitting for mate-pairs. 
For each - spot, if there are two biological reads - satisfying filter conditions, the first is - placed in the `*_1.fastq` file, and the - second is placed in the `*_2.fastq` file. If - there is only one biological read - satisfying the filter conditions, it is - placed in the `*.fastq` file.All other - reads in the spot are ignored. - -G|--spot-group Split into files by SPOT_GROUP (member name) - -R|--read-filter <[filter]> Split into files by READ_FILTER value - optionally filter by value: - pass|reject|criteria|redacted - -T|--group-in-dirs Split into subdirectories instead of files - -K|--keep-empty-files Do not delete empty files - - FORMATTING - - Sequence - -C|--dumpcs <[cskey]> Formats sequence using color space (default - for SOLiD),"cskey" may be specified for - translation - -B|--dumpbase Formats sequence using base space (default - for other than SOLiD). - - Quality - -Q|--offset Offset to use for quality conversion, - default is 33 - --fasta <[line width]> FASTA only, no qualities, optional line - wrap width (set to zero for no wrapping) - --suppress-qual-for-cskey suppress quality-value for cskey - - Defline - -F|--origfmt Defline contains only original sequence name - -I|--readids Append read id after spot id as - 'accession.spot.readid' on defline - --helicos Helicos style defline - --defline-seq Defline format specification for sequence. - --defline-qual Defline format specification for quality. - is string of characters and/or - variables. The variables can be one of: $ac - - accession, $si spot id, $sn spot - name, $sg spot group (barcode), $sl spot - length in bases, $ri read number, $rn - read name, $rl read length in bases. '[]' - could be used for an optional output: if - all vars in [] yield empty values whole - group is not printed. Empty value is empty - string or for numeric variables. 
Ex: - @$sn[_$rn]/$ri '_$rn' is omitted if name - is empty - - OTHER: - --ngc to ngc file - --disable-multithreading disable multithreading - -h|--help Output brief explanation of program usage - -V|--version Display the version of the program - -L|--log-level Logging level as number or enum string One - of (fatal|sys|int|err|warn|info) or (0-5) - Current/default is warn - -v|--verbose Increase the verbosity level of the program - Use multiple times for more verbosity - --ncbi_error_report Control program execution environment - report generation (if implemented). One of - (never|error|always). Default is error - --legacy-report use legacy style 'Written spots' for tool + Custom script to first prefetch raw data based on the + provided SRR identifiers and them merge exported from + then FASTQ file. \ No newline at end of file diff --git a/workflows/fastq-download.cwl b/workflows/fastq-download.cwl index 3cce6e0f..01286ca3 100644 --- a/workflows/fastq-download.cwl +++ b/workflows/fastq-download.cwl @@ -107,19 +107,86 @@ outputs: - markdownView: tab: 'Overview' + metadata_xml: + type: + - "null" + - type: array + items: File + outputSource: fastq_dump/metadata_xml + label: "SRR metadata files in XML format" + doc: | + SRR metadata files in XML format + + collected_metadata: + type: File + outputSource: fastq_dump/collected_metadata + label: "Collected metadata in TSV format" + doc: | + Collected metadata in TSV format + + run_acc: + type: + - "null" + - type: array + items: string + outputSource: fastq_dump/run_acc + label: "Collected Run identifiers" + doc: | + Collected Run identifiers + + experiment_acc: + type: + - "null" + - type: array + items: string + outputSource: fastq_dump/experiment_acc + label: "Collected Experiment identifiers" + doc: | + Collected Experiment identifiers + + study_acc: + type: + - "null" + - type: array + items: string + outputSource: fastq_dump/study_acc + label: "Collected SRA Study identifiers" + doc: | + Collected SRA Study 
identifiers + + biosample: + type: + - "null" + - type: array + items: string + outputSource: fastq_dump/biosample + label: "Collected BioSample identifiers" + doc: | + Collected BioSample identifiers + + bioproject: + type: + - "null" + - type: array + items: string + outputSource: fastq_dump/bioproject + label: "Collected BioProject identifiers" + doc: | + Collected BioProject identifiers + fastq_dump_stdout_log: type: File outputSource: fastq_dump/stdout_log - label: "stdout log generated by fastq_dump" + label: "stdout log generated by fastq_dump step" doc: | - stdout log generated by fastq_dump + stdout log generated by fastq_dump step fastq_dump_stderr_log: type: File outputSource: fastq_dump/stderr_log - label: "stderr log generated by fastq_dump" + label: "stderr log generated by fastq_dump step" doc: | - stderr log generated by fastq_dump + stderr log generated by fastq_dump step steps: @@ -144,6 +211,13 @@ steps: valueFrom: $(self==""?null:self) # safety measure out: - fastq_files + - metadata_xml + - collected_metadata + - run_acc + - experiment_acc + - study_acc + - biosample + - bioproject - report_md - stdout_log - stderr_log @@ -157,7 +231,7 @@ $schemas: label: "FASTQ Download" s:name: "FASTQ Download" -s:alternateName: "Download FASTQ files using fastq-dump from SRA Toolkit" +s:alternateName: "Downloads FASTQ files from the provided SRR identifiers" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/fastq-download.cwl s:codeRepository: https://github.com/datirium/workflows @@ -197,4 +271,4 @@ s:creator: doc: | FASTQ Download - Download FASTQ files using fastq-dump from SRA Toolkit + Downloads FASTQ files from the provided SRR identifiers From faec6bc72e84a7da4eb5c0d926762bfab5123d9a Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 23 May 2023 13:19:00 -0400 Subject: [PATCH 030/162] Mark deprecated pipelines --- workflows/altanalyze-cellharmony.cwl | 7 +++---- workflows/altanalyze-icgs.cwl | 7 +++---- 
workflows/altanalyze-prepare-genome.cwl | 7 +++---- workflows/chipseq-pe.cwl | 4 ++-- workflows/chipseq-se.cwl | 4 ++-- workflows/rnaseq-pe-dutp.cwl | 4 ++-- workflows/rnaseq-pe.cwl | 4 ++-- workflows/rnaseq-se-dutp.cwl | 4 ++-- workflows/rnaseq-se.cwl | 4 ++-- workflows/sc-assign-cell-types.cwl | 8 ++++---- workflows/sc_diff_expr.cwl | 8 ++++---- workflows/seurat-cluster.cwl | 8 ++++---- workflows/single-cell-preprocess.cwl | 4 ++-- 13 files changed, 35 insertions(+), 38 deletions(-) diff --git a/workflows/altanalyze-cellharmony.cwl b/workflows/altanalyze-cellharmony.cwl index a0dda12e..35e1e607 100644 --- a/workflows/altanalyze-cellharmony.cwl +++ b/workflows/altanalyze-cellharmony.cwl @@ -181,8 +181,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -s:name: "AltAnalyze CellHarmony" -label: "AltAnalyze CellHarmony" +s:name: "Deprecated. AltAnalyze CellHarmony" +label: "Deprecated. AltAnalyze CellHarmony" s:alternateName: "Runs cell-level matching and comparison of single-cell transcriptomes for AltAnalyze ICGS, Cell Ranger Count Gene Expression or Cell Ranger Aggregate experiments" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/altanalyze-cellharmony.cwl @@ -221,5 +221,4 @@ s:creator: doc: | - AltAnalyze CellHarmony - ====================== \ No newline at end of file + Deprecated. AltAnalyze CellHarmony diff --git a/workflows/altanalyze-icgs.cwl b/workflows/altanalyze-icgs.cwl index 45b50eb9..387ad46f 100644 --- a/workflows/altanalyze-icgs.cwl +++ b/workflows/altanalyze-icgs.cwl @@ -247,8 +247,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -s:name: "AltAnalyze ICGS" -label: "AltAnalyze ICGS" +s:name: "Deprecated. AltAnalyze ICGS" +label: "Deprecated. 
AltAnalyze ICGS" s:alternateName: "Runs iterative clustering and guide-gene selection for Cell Ranger Count Gene Expression or Cell Ranger Aggregate experiments" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/altanalyze-icgs.cwl @@ -287,5 +287,4 @@ s:creator: doc: | - AltAnalyze ICGS - =============== \ No newline at end of file + Deprecated. AltAnalyze ICGS diff --git a/workflows/altanalyze-prepare-genome.cwl b/workflows/altanalyze-prepare-genome.cwl index 2bbc69d6..833f974d 100644 --- a/workflows/altanalyze-prepare-genome.cwl +++ b/workflows/altanalyze-prepare-genome.cwl @@ -70,8 +70,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -s:name: "AltAnalyze Build Reference Indices" -label: "AltAnalyze Build Reference Indices" +s:name: "Deprecated. AltAnalyze Build Reference Indices" +label: "Deprecated. AltAnalyze Build Reference Indices" s:alternateName: "Builds reference genome indices for AltAnalyze ICGS and AltAnalyze CellHarmony experiments" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/altanalyze-prepare-genome.cwl @@ -110,5 +110,4 @@ s:creator: doc: | - AltAnalyze Build Reference Indices - ================================== \ No newline at end of file + Deprecated. AltAnalyze Build Reference Indices diff --git a/workflows/chipseq-pe.cwl b/workflows/chipseq-pe.cwl index c0f0b1a3..86d01f12 100644 --- a/workflows/chipseq-pe.cwl +++ b/workflows/chipseq-pe.cwl @@ -736,8 +736,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "ChIP-Seq pipeline paired-end" -s:name: "ChIP-Seq pipeline paired-end" +label: "Deprecated. ChIP-Seq pipeline paired-end" +s:name: "Deprecated. 
ChIP-Seq pipeline paired-end" s:alternateName: "ChIP-Seq basic analysis workflow for a paired-end experiment" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/chipseq-pe.cwl diff --git a/workflows/chipseq-se.cwl b/workflows/chipseq-se.cwl index 992e97a7..d3409eaa 100644 --- a/workflows/chipseq-se.cwl +++ b/workflows/chipseq-se.cwl @@ -599,8 +599,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "ChIP-Seq pipeline single-read" -s:name: "ChIP-Seq pipeline single-read" +label: "Deprecated. ChIP-Seq pipeline single-read" +s:name: "Deprecated. ChIP-Seq pipeline single-read" s:alternateName: "ChIP-Seq basic analysis workflow for single-read data" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/chipseq-se.cwl diff --git a/workflows/rnaseq-pe-dutp.cwl b/workflows/rnaseq-pe-dutp.cwl index 8810dedf..ed05ada3 100644 --- a/workflows/rnaseq-pe-dutp.cwl +++ b/workflows/rnaseq-pe-dutp.cwl @@ -529,8 +529,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -s:name: "RNA-Seq pipeline paired-end strand specific" -label: "RNA-Seq pipeline paired-end strand specific" +s:name: "Deprecated. RNA-Seq pipeline paired-end strand specific" +label: "Deprecated. 
RNA-Seq pipeline paired-end strand specific" s:alternateName: "RNA-Seq basic analysis workflow for strand specific paired-end experiment" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/rnaseq-pe-dutp.cwl diff --git a/workflows/rnaseq-pe.cwl b/workflows/rnaseq-pe.cwl index b2eff308..3e780e30 100644 --- a/workflows/rnaseq-pe.cwl +++ b/workflows/rnaseq-pe.cwl @@ -482,8 +482,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -s:name: "RNA-Seq pipeline paired-end" -label: "RNA-Seq pipeline paired-end" +s:name: "Deprecated. RNA-Seq pipeline paired-end" +label: "Deprecated. RNA-Seq pipeline paired-end" s:alternateName: "RNA-Seq basic analysis workflow for paired-end experiment" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/rnaseq-pe.cwl diff --git a/workflows/rnaseq-se-dutp.cwl b/workflows/rnaseq-se-dutp.cwl index 66430f35..74321f63 100644 --- a/workflows/rnaseq-se-dutp.cwl +++ b/workflows/rnaseq-se-dutp.cwl @@ -464,8 +464,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -s:name: "RNA-Seq pipeline single-read strand specific" -label: "RNA-Seq pipeline single-read strand specific" +s:name: "Deprecated. RNA-Seq pipeline single-read strand specific" +label: "Deprecated. 
RNA-Seq pipeline single-read strand specific" s:alternateName: "RNA-Seq basic analysis workflow for strand specific single-read experiment" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/rnaseq-se-dutp.cwl diff --git a/workflows/rnaseq-se.cwl b/workflows/rnaseq-se.cwl index 77e34f63..5d50ecf9 100644 --- a/workflows/rnaseq-se.cwl +++ b/workflows/rnaseq-se.cwl @@ -418,8 +418,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -s:name: "RNA-Seq pipeline single-read" -label: "RNA-Seq pipeline single-read" +s:name: "Deprecated. RNA-Seq pipeline single-read" +label: "Deprecated. RNA-Seq pipeline single-read" s:alternateName: "RNA-Seq basic analysis workflow for single-read experiment" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/rnaseq-se.cwl diff --git a/workflows/sc-assign-cell-types.cwl b/workflows/sc-assign-cell-types.cwl index 82bfc27c..98278dd4 100644 --- a/workflows/sc-assign-cell-types.cwl +++ b/workflows/sc-assign-cell-types.cwl @@ -291,8 +291,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell Assign Cell Types" -s:name: "Single-cell Assign Cell Types" +label: "Deprecated. Single-cell Assign Cell Types" +s:name: "Deprecated. Single-cell Assign Cell Types" s:alternateName: "Assigns cell types to Seurat clusters" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/sc-assign-cell-types.cwl @@ -331,7 +331,7 @@ s:creator: doc: | - Single-cell Assign Cell Types - ============================= + Deprecated. Single-cell Assign Cell Types + ========================================= Assigns cell types to Seurat clusters. 
\ No newline at end of file diff --git a/workflows/sc_diff_expr.cwl b/workflows/sc_diff_expr.cwl index 03fb4f06..684f5f82 100644 --- a/workflows/sc_diff_expr.cwl +++ b/workflows/sc_diff_expr.cwl @@ -375,8 +375,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell Differential Expression" -s:name: "Single-cell Differential Expression" +label: "Deprecated. Single-cell Differential Expression" +s:name: "Deprecated. Single-cell Differential Expression" s:alternateName: "Runs differential expression analysis for a subset of cells between two selected conditions" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/sc_diff_expr.cwl @@ -415,7 +415,7 @@ s:creator: doc: | - Single-cell Differential Expression - =================================== + Deprecated. Single-cell Differential Expression + =============================================== Runs differential expression analysis for a subset of cells between two selected conditions. \ No newline at end of file diff --git a/workflows/seurat-cluster.cwl b/workflows/seurat-cluster.cwl index 8c2009a9..435a59ae 100644 --- a/workflows/seurat-cluster.cwl +++ b/workflows/seurat-cluster.cwl @@ -1512,8 +1512,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -s:name: "Seurat Cluster" -label: "Seurat Cluster" +s:name: "Deprecated. Seurat Cluster" +label: "Deprecated. Seurat Cluster" s:alternateName: "Runs filtering, integration, and clustering analyses for Cell Ranger Count Gene Expression or Cell Ranger Aggregate experiments" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/seurat-cluster.cwl @@ -1552,8 +1552,8 @@ s:creator: doc: | - Seurat Cluster - ============== + Deprecated. 
Seurat Cluster + ========================== Runs filtering, integration, and clustering analyses for Cell Ranger Count Gene Expression or Cell Ranger Aggregate experiments. \ No newline at end of file diff --git a/workflows/single-cell-preprocess.cwl b/workflows/single-cell-preprocess.cwl index 3cc3c6da..d4392232 100644 --- a/workflows/single-cell-preprocess.cwl +++ b/workflows/single-cell-preprocess.cwl @@ -370,8 +370,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -s:name: "Single-Cell Preprocessing Pipeline" -label: "Single-Cell Preprocessing Pipeline" +s:name: "Deprecated. Single-Cell Preprocessing Pipeline" +label: "Deprecated. Single-Cell Preprocessing Pipeline" s:alternateName: "Single-Cell Preprocessing Pipeline" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/single-cell-preprocess.cwl From 7c21e23d1ecdf0ff2841c329bb3fe1a37a7d7800 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 23 May 2023 14:31:08 -0400 Subject: [PATCH 031/162] Update Single-Cell workflows to the latests versions --- tools/cellbrowser-build-cellranger-atac.cwl | 253 +++++ tools/cellranger-arc-mkref.cwl | 7 + tools/cellranger-atac-aggr.cwl | 368 +++++++ tools/cellranger-atac-count.cwl | 374 +++++++ tools/cellranger-mkref.cwl | 7 + tools/collect-stats-sc-arc-count.cwl | 2 +- tools/collect-stats-sc-atac-count.cwl | 116 +++ tools/collect-stats-sc-count.cwl | 2 +- ...deeptools-computematrix-referencepoint.cwl | 499 +++++++++ tools/deeptools-plotheatmap.cwl | 663 ++++++++++++ tools/sc-atac-cluster.cwl | 18 +- tools/sc-atac-coverage.cwl | 274 +++++ tools/sc-atac-dbinding.cwl | 710 +++++++++++++ tools/sc-atac-reduce.cwl | 137 ++- tools/sc-ctype-assign.cwl | 11 +- tools/sc-multiome-filter.cwl | 292 +++++- tools/sc-rna-cluster.cwl | 33 +- tools/sc-rna-da-cells.cwl | 19 +- tools/sc-rna-de-pseudobulk.cwl | 564 ++++++----- tools/sc-rna-filter.cwl | 91 +- tools/sc-rna-reduce.cwl | 
56 +- tools/sc-split-atac.cwl | 147 +++ tools/sc-triangulate.cwl | 4 +- tools/sc-wnn-cluster.cwl | 40 +- workflows/cellranger-atac-aggr.cwl | 362 +++++++ workflows/cellranger-atac-count.cwl | 521 ++++++++++ workflows/cellranger-mkref.cwl | 11 +- workflows/sc-atac-cluster.cwl | 3 +- workflows/sc-atac-coverage.cwl | 337 +++++++ workflows/sc-atac-dbinding.cwl | 952 ++++++++++++++++++ workflows/sc-atac-reduce.cwl | 85 +- workflows/sc-ctype-assign.cwl | 2 +- workflows/sc-format-transform.cwl | 133 +++ workflows/sc-multiome-filter.cwl | 207 +++- workflows/sc-rna-cluster.cwl | 12 +- workflows/sc-rna-da-cells.cwl | 3 +- workflows/sc-rna-de-pseudobulk.cwl | 546 +++++----- workflows/sc-rna-filter.cwl | 70 +- workflows/sc-rna-reduce.cwl | 5 +- workflows/sc-triangulate.cwl | 3 +- workflows/sc-wnn-cluster.cwl | 16 +- 41 files changed, 7308 insertions(+), 647 deletions(-) create mode 100644 tools/cellbrowser-build-cellranger-atac.cwl create mode 100644 tools/cellranger-atac-aggr.cwl create mode 100644 tools/cellranger-atac-count.cwl create mode 100644 tools/collect-stats-sc-atac-count.cwl create mode 100644 tools/deeptools-computematrix-referencepoint.cwl create mode 100644 tools/deeptools-plotheatmap.cwl create mode 100644 tools/sc-atac-coverage.cwl create mode 100644 tools/sc-atac-dbinding.cwl create mode 100644 tools/sc-split-atac.cwl create mode 100644 workflows/cellranger-atac-aggr.cwl create mode 100644 workflows/cellranger-atac-count.cwl create mode 100644 workflows/sc-atac-coverage.cwl create mode 100644 workflows/sc-atac-dbinding.cwl create mode 100644 workflows/sc-format-transform.cwl diff --git a/tools/cellbrowser-build-cellranger-atac.cwl b/tools/cellbrowser-build-cellranger-atac.cwl new file mode 100644 index 00000000..9f716d9d --- /dev/null +++ b/tools/cellbrowser-build-cellranger-atac.cwl @@ -0,0 +1,253 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/cellbrowser:v0.0.2 + + +requirements: +- class: 
InlineJavascriptRequirement +- class: InitialWorkDirRequirement + listing: + - entryname: cellbrowser.conf + entry: | + name = "ATAC" + shortLabel="ATAC" + priority = 1 + geneIdType="auto" + geneLabel="Feature" + exprMatrix="exprMatrix.tsv.gz" + meta="meta.csv" + coords=[ + { + "file": "tsne.coords.csv", + "shortLabel": "t-SNE" + }, + { + "file": "umap.coords.csv", + "shortLabel": "UMAP" + }, + { + "file": "lsa.coords.csv", + "shortLabel": "LSA" + } + ] + markers=[ + { + "file":"markers.tsv", + "shortLabel":"Cluster-specific peaks" + } + ] + enumFields = ["Barcode"] + clusterField="Cluster" + labelField="Cluster" + - entryname: desc.conf + entry: | + title = "ATAC" + abstract = "" + methods = "" + biorxiv_url = "" + custom = {} + + +inputs: + + bash_script: + type: string? + default: | + #!/bin/bash + echo "Prepare input data" + mkdir -p ./cellbrowser_input/analysis/clustering/graphclust \ + ./cellbrowser_input/analysis/diffexp/graphclust \ + ./cellbrowser_input/filtered_feature_bc_matrix + + cp -r $0/clustering/graphclust/clusters.csv ./cellbrowser_input/analysis/clustering/graphclust/clusters.csv + cp -r $0/enrichment/graphclust/differential_expression.csv ./cellbrowser_input/analysis/diffexp/graphclust/differential_expression.csv + cp -r $0/tsne ./cellbrowser_input/analysis/ + cp -r $0/umap ./cellbrowser_input/analysis/ + cp -r $0/lsa ./cellbrowser_input/analysis/ + + cp -r $1/* ./cellbrowser_input/filtered_feature_bc_matrix/ + cd ./cellbrowser_input/filtered_feature_bc_matrix/ + gzip barcodes.tsv + gzip matrix.mtx + cat peaks.bed | awk '{print $1":"$2"-"$3"\t"$1":"$2"-"$3"\tPeaks\t"$0}' > features.tsv + gzip features.tsv + rm -f peaks.bed + cd - + + echo "Run cbImportCellranger" + cbImportCellranger -i cellbrowser_input -o cellbrowser_output --name cellbrowser + cd ./cellbrowser_output + echo "Copying coordinates files" + cp ../cellbrowser_input/analysis/tsne/*/projection.csv tsne.coords.csv + cp ../cellbrowser_input/analysis/umap/*/projection.csv 
umap.coords.csv + cp ../cellbrowser_input/analysis/lsa/*/projection.csv lsa.coords.csv + + echo "Replace configuration files" + rm -f cellbrowser.conf desc.conf + cp ../cellbrowser.conf . + cp ../desc.conf . + if [[ -n $2 ]]; then + echo "Aggregation metadata file was provided. Adding initial cell identity classes" + cat $2 | grep -v "library_id" | awk '{print NR","$0}' > aggregation_metadata.csv + cat meta.csv | grep -v "Barcode" > meta_headerless.csv + echo "Barcode,Cluster,Identity" > meta.csv + awk -F, 'NR==FNR {identity[$1]=$2; next} {split($1,barcode,"-"); print $0","identity[barcode[2]]}' aggregation_metadata.csv meta_headerless.csv >> meta.csv + rm -f aggregation_metadata.csv meta_headerless.csv + fi + echo "Run cbBuild" + cbBuild -o html_data + inputBinding: + position: 5 + doc: | + Bash script to run cbImportCellranger and cbBuild commands + + secondary_analysis_report_folder: + type: Directory + inputBinding: + position: 6 + doc: | + Folder with secondary analysis results + + filtered_feature_bc_matrix_folder: + type: Directory + inputBinding: + position: 7 + doc: | + Folder with filtered peak-barcode matrices containing only + cellular barcodes in MEX format + + aggregation_metadata: + type: File? + inputBinding: + position: 8 + doc: | + Cellranger aggregation CSV file. 
If provided, the Identity metadata + column will be added to the meta.csv + + +outputs: + + html_data: + type: Directory + outputBinding: + glob: "cellbrowser_output/html_data" + + index_html_file: + type: File + outputBinding: + glob: "cellbrowser_output/html_data/index.html" + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["bash", "-c"] + + +stdout: cbbuild_stdout.log +stderr: cbbuild_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Cell Ranger ATAC Count/Aggregate to UCSC Cell Browser" +s:name: "Cell Ranger ATAC Count/Aggregate to UCSC Cell Browser" +s:alternateName: | + Exports clustering results from Cell Ranger ATAC Count or Cell Ranger ATAC Aggregate + experiments into compatible with UCSC Cell Browser format + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellbrowser-build-cellranger-atac.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + 
Cell Ranger ATAC Count/Aggregate to UCSC Cell Browser + + Exports clustering results from Cell Ranger ATAC Count + or Cell Ranger ATAC Aggregate experiments into a format + compatible with UCSC Cell Browser + + +s:about: | + Usage: cbImportCellranger [options] -i cellRangerDir -o outputDir - convert the cellranger output to cellbrowser format and create a cellranger.conf file + Options: + -h, --help show this help message and exit + -d, --debug show debug messages + -i INDIR, --inDir=INDIR + input folder with the cellranger analysis output. This + is the directory with the two directories 'analysis' + and 'filtered_gene_bc_matrices' + -o OUTDIR, --outDir=OUTDIR + output directory + -n DATASETNAME, --name=DATASETNAME + name of the dataset. No spaces or special characters. + -m, --noMat do not export the matrix again, saves some time if you + changed something small since the last run + + + Usage: cbBuild [options] -i cellbrowser.conf -o outputDir - add a dataset to the single cell viewer directory + If you have previously built into the same output directory with the same dataset and the + expression matrix has not changed its filesize, this will be detected and the expression + matrix will not be copied again. This means that an update of a few meta data attributes + is quite quick. + Options: + -h, --help show this help message and exit + --init copy sample cellbrowser.conf and desc.conf to current + directory + -d, --debug show debug messages + -i INCONF, --inConf=INCONF + a cellbrowser.conf file that specifies labels and all + input files, default is ./cellbrowser.conf, can be + specified multiple times + -o OUTDIR, --outDir=OUTDIR + output directory, default can be set through the env. + variable CBOUT or ~/.cellbrowser.conf, current value: + none + -p PORT, --port=PORT if build is successful, start an http server on this + port and serve the result via http://localhost:port + -r, --recursive run in all subdirectories of the current directory.
+ Useful when rebuilding a full hierarchy. + --redo=REDO do not use cached old data. Can be: 'meta' or 'matrix' + (matrix includes meta). \ No newline at end of file diff --git a/tools/cellranger-arc-mkref.cwl b/tools/cellranger-arc-mkref.cwl index d5406394..2e50840e 100644 --- a/tools/cellranger-arc-mkref.cwl +++ b/tools/cellranger-arc-mkref.cwl @@ -96,6 +96,13 @@ outputs: Compatible with Cell Ranger ARC reference folder that includes STAR and BWA indices + chrom_length_file: + type: File + outputBinding: + glob: $(get_output_folder_name() + "/star/chrNameLength.txt") + doc: | + Chromosome length file in TSV format + stdout_log: type: stdout diff --git a/tools/cellranger-atac-aggr.cwl b/tools/cellranger-atac-aggr.cwl new file mode 100644 index 00000000..4a3c275e --- /dev/null +++ b/tools/cellranger-atac-aggr.cwl @@ -0,0 +1,368 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement + expressionLib: + - var get_label = function(i) { + var rootname = inputs.barcode_metrics_report[i].basename.split('.').slice(0,-1).join('.'); + rootname = (rootname=="")?inputs.barcode_metrics_report[i].basename:rootname; + return inputs.gem_well_labels?inputs.gem_well_labels[i].replace(/\t|\s|\[|\]|\>|\<|,|\./g, "_"):rootname; + }; +- class: InitialWorkDirRequirement + listing: | + ${ + var entry = "library_id,fragments,cells\n" + for (var i=0; i < inputs.barcode_metrics_report.length; i++){ + entry += get_label(i) + "," + inputs.fragments_file_from_count[i].path + "," + inputs.barcode_metrics_report[i].path + "\n"; + } + return [{ + "entry": entry, + "entryname": "metadata.csv" + }]; + } + + +hints: +- class: DockerRequirement + dockerPull: cumulusprod/cellranger-atac:2.1.0 + + +inputs: + + fragments_file_from_count: + type: File[] + secondaryFiles: + - .tbi + doc: | + Array of files containing count and barcode information for + every ATAC fragment observed in the "cellranger-atac count" + experiment in TSV format. 
+ + barcode_metrics_report: + type: File[] + doc: | + Array of files with per-barcode fragment counts & metrics + produced by "cellranger-atac count" command in CSV format + + gem_well_labels: + type: + - "null" + - string[] + doc: | + Array of GEM well identifiers to be used for labeling purposes only. + If not provided use rootnames of files from the barcode_metrics_report + input + + indices_folder: + type: Directory + inputBinding: + position: 5 + prefix: "--reference" + doc: | + Path to folder containing a Cell Ranger ATAC or Cell Ranger + ARC reference. Should be generated by "cellranger-atac mkref" + or "cellranger-arc mkref" commands + + normalization_mode: + type: + - "null" + - type: enum + name: "normalization" + symbols: ["none", "depth"] + inputBinding: + position: 6 + prefix: "--normalize" + doc: | + Library depth normalization mode: depth, none. + Default: depth + + threads: + type: int? + inputBinding: + position: 7 + prefix: "--localcores" + doc: | + Set max cores the pipeline may request at one time. + Default: all available + + memory_limit: + type: int? + inputBinding: + position: 8 + prefix: "--localmem" + doc: | + Set max GB the pipeline may request at one time + Default: all available + + virt_memory_limit: + type: int? 
+ inputBinding: + position: 9 + prefix: "--localvmem" + doc: | + Set max virtual address space in GB for the pipeline + Default: all available + + +outputs: + + web_summary_report: + type: File + outputBinding: + glob: "aggregated/outs/web_summary.html" + doc: | + Run summary metrics and charts in HTML format + + metrics_summary_report_json: + type: File + outputBinding: + glob: "aggregated/outs/summary.json" + doc: | + Run summary metrics in JSON format + + metrics_summary_report_csv: + type: File + outputBinding: + glob: "aggregated/outs/summary.csv" + doc: | + Run summary metrics in CSV format + + barcode_metrics_report: + type: File + outputBinding: + glob: "aggregated/outs/singlecell.csv" + doc: | + Per-barcode fragment counts & metrics in CSV format + + fragments_file: + type: File + outputBinding: + glob: "aggregated/outs/fragments.tsv.gz" + secondaryFiles: + - .tbi + doc: | + Count and barcode information for every ATAC fragment observed + in the aggregated experiment in TSV format + + peaks_bed_file: + type: File + outputBinding: + glob: "aggregated/outs/peaks.bed" + doc: | + Locations of open-chromatin regions identified in the + aggregated experiment (these regions are referred to + as "peaks") + + peak_annotation_file: + type: File + outputBinding: + glob: "aggregated/outs/peak_annotation.tsv" + doc: | + Annotations of peaks based on genomic proximity alone + + secondary_analysis_report_folder: + type: Directory + outputBinding: + glob: "aggregated/outs/analysis" + doc: | + Folder with secondary analysis results + + filtered_feature_bc_matrix_folder: + type: Directory + outputBinding: + glob: "aggregated/outs/filtered_peak_bc_matrix" + doc: | + Folder with aggregated filtered peak-barcode matrices + containing only cellular barcodes in MEX format. 
+ + filtered_feature_bc_matrix_h5: + type: File + outputBinding: + glob: "aggregated/outs/filtered_peak_bc_matrix.h5" + doc: | + Aggregated filtered peak-barcode matrices containing + only cellular barcodes in HDF5 format. + + filtered_tf_bc_matrix_folder: + type: Directory? + outputBinding: + glob: "aggregated/outs/filtered_tf_bc_matrix" + doc: | + Folder with aggregated filtered tf-barcode matrices + containing only cellular barcodes in MEX format. + + filtered_tf_bc_matrix_h5: + type: File? + outputBinding: + glob: "aggregated/outs/filtered_tf_bc_matrix.h5" + doc: | + Aggregated filtered tf-barcode matrices containing + only cellular barcodes in HDF5 format. + + aggregation_metadata: + type: File + outputBinding: + glob: "aggregated/outs/aggregation_csv.csv" + doc: | + Aggregation CSV file + + loupe_browser_track: + type: File + outputBinding: + glob: "aggregated/outs/cloupe.cloupe" + doc: | + Loupe Browser visualization and analysis file + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["cellranger-atac", "aggr", "--disable-ui", "--id", "aggregated", "--csv", "metadata.csv"] + + +stdout: cellranger_atac_aggr_stdout.log +stderr: cellranger_atac_aggr_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Cellranger ATAC Aggregate" +s:name: "Cellranger ATAC Aggregate" +s:alternateName: "Aggregates outputs from multiple runs of Cell Ranger Count Chromatin Accessibility experiments" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-atac-aggr.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + 
s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Cellranger ATAC Aggregate + + Aggregates outputs from multiple runs of Cell Ranger Count Chromatin + Accessibility experiments + + Parameters set by default: + --disable-ui - no need for any UI when running in Docker container + --id - hardcoded to `aggregated` as we want to return the + content of the outputs folder as separate outputs + + Skipped parameters: + --description + --peaks + --nosecondary + --dim-reduce + --dry + --jobmode + --mempercore + --maxjobs + --jobinterval + --overrides + --uiport + --noexit + --nopreflight + + +s:about: | + USAGE: + cellranger-atac aggr [OPTIONS] --id --csv --reference + + OPTIONS: + --id + A unique run id and output folder name [a-zA-Z0-9_-]+ of maximum length 64 characters + --description + Sample description to embed in output files + [default: ] + --csv + Path to CSV file enumerating `cellranger-atac count` outputs. + For example, a CSV for aggregating two samples would look as follows (blank lines are ignored): + library_id,fragments,cells + L1,/data/L1/outs/fragments.tsv.gz,/data/L1/outs/singlecell.csv + L2,/data/L2/outs/fragments.tsv.gz,/data/L2/outs/singlecell.csv + Optionally, metadata associated with these libraries can be specified using additional columns. This information is not used by the pipeline but will be available in + the Loupe file for visualization.
+ --reference + Path to folder containing a Cell Ranger ATAC or Cell Ranger ARC reference + --peaks + Override peak caller: specify peaks to use in downstream analyses from supplied 3-column BED file. The supplied peaks file must be sorted by position and not contain + overlapping peaks; comment lines beginning with `#` are allowed + --normalize + Library depth normalization mode + [default: depth] + [possible values: depth, none] + --nosecondary + Disable secondary analysis, e.g. clustering + --dim-reduce + Dimensionality reduction mode for clustering + [default: lsa] + [possible values: lsa, pca, plsa] + --dry + Do not execute the pipeline. Generate a pipeline invocation (.mro) file and stop + --jobmode + Job manager to use. Valid options: local (default), sge, lsf, slurm or path to a .template file. Search for help on "Cluster Mode" at support.10xgenomics.com for more + details on configuring the pipeline to use a compute cluster + [default: local] + --localcores + Set max cores the pipeline may request at one time. Only applies to local jobs + --localmem + Set max GB the pipeline may request at one time. Only applies to local jobs + --localvmem + Set max virtual address space in GB for the pipeline. Only applies to local jobs + --mempercore + Reserve enough threads for each job to ensure enough memory will be available, assuming each core on your cluster has at least this much memory available. Only applies + to cluster jobmodes + --maxjobs + Set max jobs submitted to cluster at one time. Only applies to cluster jobmodes + --jobinterval + Set delay between submitting jobs to cluster, in ms. Only applies to cluster jobmodes + --overrides + The path to a JSON file that specifies stage-level overrides for cores and memory. Finer-grained than --localcores, --mempercore and --localmem. 
Consult + https://support.10xgenomics.com/ for an example override file + --uiport + Serve web UI at http://localhost:PORT + --disable-ui + Do not serve the web UI + --noexit + Keep web UI running after pipestance completes or fails + --nopreflight + Skip preflight checks + --help + Print help information \ No newline at end of file diff --git a/tools/cellranger-atac-count.cwl b/tools/cellranger-atac-count.cwl new file mode 100644 index 00000000..0125b5ba --- /dev/null +++ b/tools/cellranger-atac-count.cwl @@ -0,0 +1,374 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: InitialWorkDirRequirement + listing: | + ${ + var listing = [ + { + "entry": inputs.fastq_file_r1, + "entryname": "sample_S1_L001_R1_001.fastq", + "writable": true + }, + { + "entry": inputs.fastq_file_r2, + "entryname": "sample_S1_L001_R2_001.fastq", + "writable": true + }, + { + "entry": inputs.fastq_file_r3, + "entryname": "sample_S1_L001_R3_001.fastq", + "writable": true + } + ]; + if (inputs.fastq_file_i1){ + listing.push( + { + "entry": inputs.fastq_file_i1, + "entryname": "sample_S1_L001_I1_001.fastq", + "writable": true + } + ); + }; + return listing; + } + + +hints: +- class: DockerRequirement + dockerPull: cumulusprod/cellranger-atac:2.1.0 + + +inputs: + + fastq_file_r1: + type: File + doc: | + FASTQ read 1 file (will be staged into workdir as sample_S1_L001_R1_001.fastq) + + fastq_file_r2: + type: File + doc: | + FASTQ read 2 file (will be staged into workdir as sample_S1_L001_R2_001.fastq) + + fastq_file_r3: + type: File + doc: | + FASTQ read 3 file (will be staged into workdir as sample_S1_L001_R3_001.fastq) + + fastq_file_i1: + type: File? + doc: | + FASTQ index file (if provided, will be staged into workdir as sample_S1_L001_I1_001.fastq) + + indices_folder: + type: Directory + inputBinding: + position: 10 + prefix: "--reference" + doc: | + Path to folder containing a Cell Ranger ATAC or Cell Ranger + ARC reference. 
Should be generated by "cellranger-atac mkref" + or "cellranger-arc mkref" commands + + force_cells: + type: int? + inputBinding: + position: 11 + prefix: "--force-cells" + doc: | + Define the top N barcodes with the most fragments overlapping + peaks as cells. N must be a positive integer <= 20,000. Please + consult the documentation before using this option + + threads: + type: int? + inputBinding: + position: 12 + prefix: "--localcores" + doc: | + Set max cores the pipeline may request at one time. + Default: all available + + memory_limit: + type: int? + inputBinding: + position: 13 + prefix: "--localmem" + doc: | + Set max GB the pipeline may request at one time + Default: all available + + virt_memory_limit: + type: int? + inputBinding: + position: 14 + prefix: "--localvmem" + doc: | + Set max virtual address space in GB for the pipeline + Default: all available + + +outputs: + + web_summary_report: + type: File + outputBinding: + glob: "sample/outs/web_summary.html" + doc: | + Run summary metrics and charts in HTML format + + metrics_summary_report_json: + type: File + outputBinding: + glob: "sample/outs/summary.json" + doc: | + Run summary metrics in JSON format + + metrics_summary_report_csv: + type: File + outputBinding: + glob: "sample/outs/summary.csv" + doc: | + Run summary metrics in CSV format + + barcode_metrics_report: + type: File + outputBinding: + glob: "sample/outs/singlecell.csv" + doc: | + Per-barcode fragment counts & metrics in CSV format + + possorted_genome_bam_bai: + type: File? 
+ outputBinding: + glob: "sample/outs/possorted_bam.bam" + secondaryFiles: + - .bai + doc: | + Indexed position-sorted reads aligned to the genome annotated + with barcode information in BAM format + + fragments_file: + type: File + outputBinding: + glob: "sample/outs/fragments.tsv.gz" + secondaryFiles: + - .tbi + doc: | + Count and barcode information for every ATAC fragment observed + in the experiment in TSV format + + peaks_bed_file: + type: File + outputBinding: + glob: "sample/outs/peaks.bed" + doc: | + Locations of open-chromatin regions identified in the + experiment (these regions are referred to as "peaks") + + peak_annotation_file: + type: File + outputBinding: + glob: "sample/outs/peak_annotation.tsv" + doc: | + Annotations of peaks based on genomic proximity alone + + cut_sites_bigwig_file: + type: File + outputBinding: + glob: "sample/outs/cut_sites.bigwig" + doc: | + Smoothed transposition site track in bigWig format + + peak_motif_mapping_bed: + type: File? + outputBinding: + glob: "sample/outs/peak_motif_mapping.bed" + doc: | + File with peak-motif associations in BED format + + filtered_feature_bc_matrix_folder: + type: Directory + outputBinding: + glob: "sample/outs/filtered_peak_bc_matrix" + doc: | + Folder with filtered peak-barcode matrices containing only cellular barcodes in MEX format. + + filtered_feature_bc_matrix_h5: + type: File + outputBinding: + glob: "sample/outs/filtered_peak_bc_matrix.h5" + doc: | + Filtered peak-barcode matrices containing only cellular barcodes in HDF5 format. + + filtered_tf_bc_matrix_folder: + type: Directory? + outputBinding: + glob: "sample/outs/filtered_tf_bc_matrix" + doc: | + Folder with filtered tf-barcode matrices containing only cellular barcodes in MEX format. + + filtered_tf_bc_matrix_h5: + type: File? + outputBinding: + glob: "sample/outs/filtered_tf_bc_matrix.h5" + doc: | + Filtered tf-barcode matrices containing only cellular barcodes in HDF5 format. 
+ + raw_feature_bc_matrices_folder: + type: Directory + outputBinding: + glob: "sample/outs/raw_peak_bc_matrix" + doc: | + Folder with unfiltered peak-barcode matrices containing all barcodes in MEX format + + raw_feature_bc_matrices_h5: + type: File + outputBinding: + glob: "sample/outs/raw_peak_bc_matrix.h5" + doc: | + Unfiltered peak-barcode matrices containing all barcodes in HDF5 format + + secondary_analysis_report_folder: + type: Directory + outputBinding: + glob: "sample/outs/analysis" + doc: | + Folder with secondary analysis results + + loupe_browser_track: + type: File + outputBinding: + glob: "sample/outs/cloupe.cloupe" + doc: | + Loupe Browser visualization and analysis file + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["cellranger-atac", "count", "--disable-ui", "--fastqs", ".", "--sample", "sample", "--id", "sample"] + + +stdout: cellranger_atac_count_stdout.log +stderr: cellranger_atac_count_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Cell Ranger ATAC Count" +s:name: "Cell Ranger ATAC Count" +s:alternateName: "Counts reads from a single scATAC-Seq library" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-atac-count.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: 
"https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Cell Ranger ATAC Count + + Counts reads from a single scATAC-Seq library. + + Parameters set by default: + --disable-ui - no need for any UI when running in Docker container + --id - can be hardcoded as we rename input files anyway + --fastqs - points to the current directory, because input + FASTQ files are staged there + --sample - hardcoded to sample as we stage input fastq files + with the hardcoded names + + Not implemented parameters: + --description - not needed for now + --project - not needed to select input files by folder + --lanes - not needed for now + --peaks - not needed for now + --dim-reduce - not needed for now + --subsample-rate - not needed for now + --dry - not applicable to our use case + --jobmode - we use default local mode + --mempercore - not used for local mode + --maxjobs - not used for local mode + --jobinterval - not used for local mode + --overrides - not needed for now + --uiport - we disabled UI + --noexit - we disabled UI + --nopreflight - no reason to skip preflight checks + + +s:about: | + USAGE: + cellranger-atac count [OPTIONS] --id --reference --fastqs + + OPTIONS: + --id A unique run id and output folder name [a-zA-Z0-9_-]+ of maximum length 64 characters + --description Sample description to embed in output files [default: ] + --reference Path to folder containing a Cell Ranger ATAC or Cell Ranger ARC reference + --fastqs Path to input FASTQ data + --project Name of the project folder within a mkfastq or bcl2fastq-generated folder to pick FASTQs from + --sample Prefix of the filenames of FASTQs
to select + --lanes Only use FASTQs from selected lanes + --force-cells Define the top N barcodes with the most fragments overlapping peaks as cells. N must be a positive integer <= 20,000. Please consult the + documentation before using this option + --peaks Override peak caller: specify peaks to use in downstream analyses from supplied 3-column BED file. The supplied peaks file must be sorted by + position and not contain overlapping peaks; comment lines beginning with `#` are allowed + --dim-reduce Dimensionality reduction mode for clustering [default: lsa] [possible values: lsa, pca, plsa] + --subsample-rate Downsample to preserve this fraction of reads + --dry Do not execute the pipeline. Generate a pipeline invocation (.mro) file and stop + --jobmode Job manager to use. Valid options: local (default), sge, lsf, slurm or path to a .template file. Search for help on "Cluster Mode" at + support.10xgenomics.com for more details on configuring the pipeline to use a compute cluster [default: local] + --localcores Set max cores the pipeline may request at one time. Only applies to local jobs + --localmem Set max GB the pipeline may request at one time. Only applies to local jobs + --localvmem Set max virtual address space in GB for the pipeline. Only applies to local jobs + --mempercore Reserve enough threads for each job to ensure enough memory will be available, assuming each core on your cluster has at least this much + memory available. Only applies to cluster jobmodes + --maxjobs Set max jobs submitted to cluster at one time. Only applies to cluster jobmodes + --jobinterval Set delay between submitting jobs to cluster, in ms. Only applies to cluster jobmodes + --overrides The path to a JSON file that specifies stage-level overrides for cores and memory. Finer-grained than --localcores, --mempercore and + --localmem. 
Consult https://support.10xgenomics.com/ for an example override file + --uiport Serve web UI at http://localhost:PORT + --disable-ui Do not serve the web UI + --noexit Keep web UI running after pipestance completes or fails + --nopreflight Skip preflight checks + --help Print help information \ No newline at end of file diff --git a/tools/cellranger-mkref.cwl b/tools/cellranger-mkref.cwl index 96a90206..6ded927b 100644 --- a/tools/cellranger-mkref.cwl +++ b/tools/cellranger-mkref.cwl @@ -76,6 +76,13 @@ outputs: Cellranger-compatible reference folder that includes STAR indices and some additional files + chrom_length_file: + type: File + outputBinding: + glob: $(get_output_folder_name() + "/star/chrNameLength.txt") + doc: | + Chromosome length file in TSV format + stdout_log: type: stdout diff --git a/tools/collect-stats-sc-arc-count.cwl b/tools/collect-stats-sc-arc-count.cwl index e38106f5..63fcd245 100644 --- a/tools/collect-stats-sc-arc-count.cwl +++ b/tools/collect-stats-sc-arc-count.cwl @@ -18,7 +18,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/scstats:v0.0.1 + dockerPull: biowardrobe2/scstats:v0.0.2 inputs: diff --git a/tools/collect-stats-sc-atac-count.cwl b/tools/collect-stats-sc-atac-count.cwl new file mode 100644 index 00000000..4ada8d1f --- /dev/null +++ b/tools/collect-stats-sc-atac-count.cwl @@ -0,0 +1,116 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: ShellCommandRequirement +- class: InlineJavascriptRequirement + expressionLib: + - var get_output_prefix = function() { + if (inputs.output_prefix) { + return inputs.output_prefix; + } + var root = inputs.metrics_summary_report.basename.split('.').slice(0,-1).join('.'); + var suffix = "_stats"; + return (root == "")?inputs.metrics_summary_report.basename+suffix:root+suffix; + }; + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/scstats:v0.0.2 + + +inputs: + + metrics_summary_report: + type: File + inputBinding: + position: 6 + 
prefix: "--metrics" + + output_prefix: + type: string? + inputBinding: + position: 7 + prefix: "--output" + valueFrom: $(get_output_prefix()) + default: "" + + +outputs: + + collected_statistics_yaml: + type: File + outputBinding: + glob: $(get_output_prefix()+".yaml") + + collected_statistics_tsv: + type: File + outputBinding: + glob: $(get_output_prefix()+".tsv") + + collected_statistics_md: + type: File + outputBinding: + glob: $(get_output_prefix()+".md") + + +baseCommand: ["cell_ranger_atac_count_stats.py"] + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +s:name: "Cell Ranger ATAC Count Statistics" +label: "Cell Ranger ATAC Count Statistics" +s:alternateName: "Collects statistics from Cell Ranger ATAC Count experiment" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/collect-stats-sc-atac-count.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Cell Ranger ATAC Count Statistics + 
================================ + + Collects statistics from Cell Ranger ATAC Count experiment + + +s:about: | + Collects statistics from Cell Ranger ATAC Count experiment diff --git a/tools/collect-stats-sc-count.cwl b/tools/collect-stats-sc-count.cwl index ac85994e..504fda74 100644 --- a/tools/collect-stats-sc-count.cwl +++ b/tools/collect-stats-sc-count.cwl @@ -18,7 +18,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/scstats:v0.0.1 + dockerPull: biowardrobe2/scstats:v0.0.2 inputs: diff --git a/tools/deeptools-computematrix-referencepoint.cwl b/tools/deeptools-computematrix-referencepoint.cwl new file mode 100644 index 00000000..5ec18528 --- /dev/null +++ b/tools/deeptools-computematrix-referencepoint.cwl @@ -0,0 +1,499 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/deeptools:v0.0.1 + + +inputs: + + score_files: + type: + - File + - File[] + inputBinding: + position: 5 + prefix: "--scoreFileName" + doc: | + BigWig file(s) containing the scores to be plotted + + regions_files: + type: + - File + - File[] + inputBinding: + position: 6 + prefix: "--regionsFileName" + doc: | + File name or names, in BED format, containing the regions to plot + + reference_point: + type: + - "null" + - type: enum + name: "reference" + symbols: ["TSS", "TES", "center"] + inputBinding: + position: 7 + prefix: "--referencePoint" + doc: | + The reference point for the plotting could be either the region start (TSS), + the region end (TES) or the center of the region. Note that regardless of what + you specify, plotHeatmap/plotProfile will default to using “TSS” as the label. + Default: TSS + + before_region_start_length: + type: int? + inputBinding: + position: 8 + prefix: "--beforeRegionStartLength" + doc: | + Distance upstream of the reference-point selected. + Default: 500 + + after_region_start_length: + type: int? 
+ inputBinding: + position: 9 + prefix: "--afterRegionStartLength" + doc: | + Distance downstream of the reference-point selected. + Default: 1500 + + nan_after_end: + type: boolean? + inputBinding: + position: 10 + prefix: "--nanAfterEnd" + doc: | + If set, any values after the region end are discarded. This is useful to visualize + the region end when not using the scale-regions mode and when the reference-point + is set to the TSS. + + bin_size: + type: int? + inputBinding: + position: 11 + prefix: "--binSize" + doc: | + Length, in bases, of the non-overlapping bins for averaging the score over + the regions length. + Default: 10 + + sort_regions: + type: + - "null" + - type: enum + name: "sort" + symbols: ["descend", "ascend", "no", "keep"] + inputBinding: + position: 12 + prefix: "--sortRegions" + doc: | + Whether the output file should present the regions sorted. The default is to + not sort the regions. Note that this is only useful if you plan to plot the + results yourself and not, for example, with plotHeatmap, which will override this. + Note also that unsorted output will be in whatever order the regions happen to + be processed in and not match the order in the input files. If you require the + output order to match that of the input regions, then either specify “keep” or + use computeMatrixOperations to resort the results file. + Default: keep + + sort_using: + type: + - "null" + - type: enum + name: "sort_type" + symbols: ["mean", "median", "max", "min", "sum", "region_length"] + inputBinding: + position: 13 + prefix: "--sortUsing" + doc: | + Indicate which method should be used for sorting. The value is computed for + each row. Note that the region_length option will lead to a dotted line + within the heatmap that indicates the end of the regions. 
+ Default: mean + + average_type_bins: + type: + - "null" + - type: enum + name: "average" + symbols: ["mean", "median", "min", "max", "std", "sum"] + inputBinding: + position: 14 + prefix: "--averageTypeBins" + doc: | + Define the type of statistic that should be used over the bin size range. + The options are: “mean”, “median”, “min”, “max”, “sum” and “std”. + Default: mean + + missing_data_as_zero: + type: boolean? + inputBinding: + position: 15 + prefix: "--missingDataAsZero" + doc: | + If set, missing data (NAs) will be treated as zeros. The default is to ignore such cases, + which will be depicted as black areas in a heatmap. (see the --missingDataColor argument + of the plotHeatmap command for additional options) + + skip_zeros: + type: boolean? + inputBinding: + position: 16 + prefix: "--skipZeros" + doc: | + Whether regions with only scores of zero should be included or not. + Default is to include them + + min_threshold: + type: float? + inputBinding: + position: 17 + prefix: "--minThreshold" + doc: | + Numeric value. Any region containing a value that is less than or equal to this will be skipped. + This is useful to skip, for example, genes where the read count is zero for any of the bins. + This could be the result of unmappable areas and can bias the overall results. + Default: None + + max_threshold: + type: float? + inputBinding: + position: 18 + prefix: "--maxThreshold" + doc: | + Numeric value. Any region containing a value greater than or equal to this will be skipped. + The maxThreshold is useful to skip those few regions with very high read counts (e.g. micro satellites) + that may bias the average values. + Default: None + + samples_label: + type: + - "null" + - string + - string[] + inputBinding: + position: 19 + prefix: "--samplesLabel" + doc: | + Labels for the samples. This will then be passed to plotHeatmap and plotProfile. + The default is to use the file name of the sample. 
The sample labels should be + separated by spaces and quoted if a label itself contains a space + E.g. --samplesLabel label-1 “label 2” + + blacklisted_regions: + type: File? + inputBinding: + position: 20 + prefix: "--blackListFileName" + doc: | + A BED file containing regions that should be excluded from all analyses. Currently + this works by rejecting genomic chunks that happen to overlap an entry. Consequently, + for BAM files, if a read partially overlaps a blacklisted region or a fragment spans + over it, then the read/fragment might still be considered + + output_filename: + type: string + inputBinding: + position: 21 + prefix: "--outFileName" + doc: | + File name to save the gzipped matrix file needed by the “plotHeatmap” and “plotProfile” tools + + threads: + type: int? + inputBinding: + position: 22 + prefix: "--numberOfProcessors" + doc: | + Number of processors to use + + +outputs: + + scores_matrix: + type: File + outputBinding: + glob: $(inputs.output_filename) + doc: | + Scores per genome regions matrix, + File that can be used with plotHeatmap and plotProfiles + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["computeMatrix", "reference-point", "--verbose"] + + +stdout: compute_matrix_stdout.log +stderr: compute_matrix_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +s:mainEntity: + $import: ./metadata/deeptools-metadata.yaml + +label: "computeMatrix - prepares an intermediate file that can be used with plotHeatmap and plotProfiles" +s:name: "computeMatrix - prepares an intermediate file that can be used with plotHeatmap and plotProfiles" +s:alternateName: "computeMatrix - prepares an intermediate file that can be used with plotHeatmap and plotProfiles" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/deeptools-computematrix-referencepoint.cwl +s:codeRepository: 
https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Tool calculates scores per genome regions and prepares an intermediate file that can be used + with plotHeatmap and plotProfiles. Typically, the genome regions are genes, but any other + regions defined in a BED file can be used. computeMatrix accepts multiple score files + (bigWig format) and multiple regions files (BED format). This tool can also be used to filter + and sort regions according to their score. + + +s:about: | + usage: An example usage is: + computeMatrix reference-point -S -R -a 3000 -b 3000 + + optional arguments: + -h, --help show this help message and exit + + Required arguments: + --regionsFileName File [File ...], -R File [File ...] + File name or names, in BED or GTF format, containing + the regions to plot. If multiple bed files are given, + each one is considered a group that can be plotted + separately. Also, adding a "#" symbol in the bed file + causes all the regions until the previous "#" to be + considered one group. 
(default: None) + --scoreFileName File [File ...], -S File [File ...] + bigWig file(s) containing the scores to be plotted. + Multiple files should be separated by spaced. BigWig + files can be obtained by using the bamCoverage or + bamCompare tools. More information about the bigWig + file format can be found at + http://genome.ucsc.edu/goldenPath/help/bigWig.html + (default: None) + + Output options: + --outFileName OUTFILENAME, -out OUTFILENAME, -o OUTFILENAME + File name to save the gzipped matrix file needed by + the "plotHeatmap" and "plotProfile" tools. (default: + None) + --outFileNameMatrix FILE + If this option is given, then the matrix of values + underlying the heatmap will be saved using the + indicated name, e.g. IndividualValues.tab.This matrix + can easily be loaded into R or other programs. + (default: None) + --outFileSortedRegions BED file + File name in which the regions are saved after skiping + zeros or min/max threshold values. The order of the + regions in the file follows the sorting order + selected. This is useful, for example, to generate + other heatmaps keeping the sorting of the first + heatmap. Example: Heatmap1sortedRegions.bed (default: + None) + + Optional arguments: + --version show program's version number and exit + --referencePoint {TSS,TES,center} + The reference point for the plotting could be either + the region start (TSS), the region end (TES) or the + center of the region. Note that regardless of what you + specify, plotHeatmap/plotProfile will default to using + "TSS" as the label. (Default: TSS) + --beforeRegionStartLength INT bp, -b INT bp, --upstream INT bp + Distance upstream of the reference-point selected. + (Default: 500) + --afterRegionStartLength INT bp, -a INT bp, --downstream INT bp + Distance downstream of the reference-point selected. + (Default: 1500) + --nanAfterEnd If set, any values after the region end are discarded. 
+ This is useful to visualize the region end when not + using the scale-regions mode and when the reference- + point is set to the TSS. (default: False) + --binSize BINSIZE, -bs BINSIZE + Length, in bases, of the non-overlapping bins for + averaging the score over the regions length. (Default: + 10) + --sortRegions {descend,ascend,no,keep} + Whether the output file should present the regions + sorted. The default is to not sort the regions. Note + that this is only useful if you plan to plot the + results yourself and not, for example, with + plotHeatmap, which will override this. Note also that + unsorted output will be in whatever order the regions + happen to be processed in and not match the order in + the input files. If you require the output order to + match that of the input regions, then either specify + "keep" or use computeMatrixOperations to resort the + results file. (Default: keep) + --sortUsing {mean,median,max,min,sum,region_length} + Indicate which method should be used for sorting. The + value is computed for each row.Note that the + region_length option will lead to a dotted line within + the heatmap that indicates the end of the regions. + (Default: mean) + --sortUsingSamples SORTUSINGSAMPLES [SORTUSINGSAMPLES ...] + List of sample numbers (order as in matrix), that are + used for sorting by --sortUsing, no value uses all + samples, example: --sortUsingSamples 1 3 (default: + None) + --averageTypeBins {mean,median,min,max,std,sum} + Define the type of statistic that should be used over + the bin size range. The options are: "mean", "median", + "min", "max", "sum" and "std". The default is "mean". + (Default: mean) + --missingDataAsZero If set, missing data (NAs) will be treated as zeros. + The default is to ignore such cases, which will be + depicted as black areas in a heatmap. (see the + --missingDataColor argument of the plotHeatmap command + for additional options). 
(default: False) + --skipZeros Whether regions with only scores of zero should be + included or not. Default is to include them. (default: + False) + --minThreshold MINTHRESHOLD + Numeric value. Any region containing a value that is + less than or equal to this will be skipped. This is + useful to skip, for example, genes where the read + count is zero for any of the bins. This could be the + result of unmappable areas and can bias the overall + results. (Default: None) + --maxThreshold MAXTHRESHOLD + Numeric value. Any region containing a value greater + than or equal to this will be skipped. The + maxThreshold is useful to skip those few regions with + very high read counts (e.g. micro satellites) that may + bias the average values. (Default: None) + --blackListFileName BED file, -bl BED file + A BED file containing regions that should be excluded + from all analyses. Currently this works by rejecting + genomic chunks that happen to overlap an entry. + Consequently, for BAM files, if a read partially + overlaps a blacklisted region or a fragment spans over + it, then the read/fragment might still be considered. + (default: None) + --samplesLabel SAMPLESLABEL [SAMPLESLABEL ...] + Labels for the samples. This will then be passed to + plotHeatmap and plotProfile. The default is to use the + file name of the sample. The sample labels should be + separated by spaces and quoted if a label + itselfcontains a space E.g. --samplesLabel label-1 + "label 2" (default: None) + --smartLabels Instead of manually specifying labels for the input + bigWig and BED/GTF files, this causes deepTools to use + the file name after removing the path and extension. + (default: False) + --quiet, -q Set to remove any warning or processing messages. + (default: False) + --verbose Being VERY verbose in the status messages. --quiet + will disable this. (default: False) + --scale SCALE If set, all values are multiplied by this number. 
+ (Default: 1) + --numberOfProcessors INT, -p INT + Number of processors to use. Type "max/2" to use half + the maximum number of processors or "max" to use all + available processors. (Default: 1) + + GTF/BED12 options: + --metagene When either a BED12 or GTF file are used to provide + regions, perform the computation on the merged exons, + rather than using the genomic interval defined by the + 5-prime and 3-prime most transcript bound (i.e., + columns 2 and 3 of a BED file). If a BED3 or BED6 file + is used as input, then columns 2 and 3 are used as an + exon. (Default: False) + --transcriptID TRANSCRIPTID + When a GTF file is used to provide regions, only + entries with this value as their feature (column 3) + will be processed as transcripts. (Default: + transcript) + --exonID EXONID When a GTF file is used to provide regions, only + entries with this value as their feature (column 3) + will be processed as exons. CDS would be another + common value for this. (Default: exon) + --transcript_id_designator TRANSCRIPT_ID_DESIGNATOR + Each region has an ID (e.g., ACTB) assigned to it, + which for BED files is either column 4 (if it exists) + or the interval bounds. For GTF files this is instead + stored in the last column as a key:value pair (e.g., + as 'transcript_id "ACTB"', for a key of transcript_id + and a value of ACTB). In some cases it can be + convenient to use a different identifier. To do so, + set this to the desired key. (Default: transcript_id) + + deepBlue arguments: + Options used only for remote bedgraph/wig files hosted on deepBlue + + --deepBlueURL DEEPBLUEURL + For remote files bedgraph/wiggle files hosted on + deepBlue, this specifies the server URL. The default + is "http://deepblue.mpi-inf.mpg.de/xmlrpc", which + should not be changed without good reason. (default: + http://deepblue.mpi-inf.mpg.de/xmlrpc) + --userKey USERKEY For remote files bedgraph/wiggle files hosted on + deepBlue, this specifies the user key to use for + access. 
The default is "anonymous_key", which suffices + for public datasets. If you need access to a + restricted access/private dataset, then request a key + from deepBlue and specify it here. (default: + anonymous_key) + --deepBlueTempDir DEEPBLUETEMPDIR + If specified, temporary files from preloading datasets + from deepBlue will be written here (note, this + directory must exist). If not specified, where ever + temporary files would normally be written on your + system is used. (default: None) + --deepBlueKeepTemp If specified, temporary bigWig files from preloading + deepBlue datasets are not deleted. A message will be + printed noting where these files are and what sample + they correspond to. These can then be used if you wish + to analyse the same sample with the same regions + again. (default: False) \ No newline at end of file diff --git a/tools/deeptools-plotheatmap.cwl b/tools/deeptools-plotheatmap.cwl new file mode 100644 index 00000000..72c56f3c --- /dev/null +++ b/tools/deeptools-plotheatmap.cwl @@ -0,0 +1,663 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/deeptools:v0.0.1 + + +inputs: + + scores_matrix: + type: File + inputBinding: + position: 5 + prefix: "--matrixFile" + doc: | + Matrix file from the computeMatrix tool + + output_filename: + type: string + inputBinding: + position: 6 + prefix: "--outFileName" + doc: | + File name to save the image to. The file ending will be used to determine the image format. 
+ The available options are: “png”, “eps”, “pdf” and “svg”, e.g., MyHeatmap.png + + interpolation_method: + type: + - "null" + - type: enum + name: "interpolation_method" + symbols: ["auto", "nearest", "bilinear", "bicubic", "gaussian"] + inputBinding: + position: 7 + prefix: "--interpolationMethod" + doc: | + If the heatmap image contains a large number of columns is usually better to use an + interpolation method to produce better results. By default, plotHeatmap uses the method + nearest if the number of columns is 1000 or less. Otherwise it uses the bilinear method. + This default behaviour can be changed by using any of the following options: “nearest”, + “bilinear”, “bicubic”, “gaussian” + + dpi: + type: int? + inputBinding: + position: 8 + prefix: "--dpi" + doc: "Set the DPI to save the figure." + + plot_type: + type: + - "null" + - type: enum + name: "plot_type" + symbols: ["lines", "fill", "se", "std"] + inputBinding: + position: 9 + prefix: "--plotType" + doc: | + “lines” will plot the profile line based on the average type selected. + “fill” fills the region between zero and the profile curve. The fill in + color is semi transparent to distinguish different profiles. + “se” and “std” color the region between the profile and the standard error + or standard deviation of the data. + + sort_regions: + type: + - "null" + - type: enum + name: "sort_regions" + symbols: ["descend", "ascend", "no", "keep"] + inputBinding: + position: 10 + prefix: "--sortRegions" + doc: | + Whether the heatmap should present the regions sorted. The default is to sort in + descending order based on the mean value per region. Note that “keep” and “no” are + the same thing. + + sort_using: + type: + - "null" + - type: enum + name: "sort_using" + symbols: ["mean", "median", "max", "min", "sum", "region_length"] + inputBinding: + position: 11 + prefix: "--sortUsing" + doc: | + Indicates which method should be used for sorting. For each row the method is computed. 
+ For region_length, a dashed line is drawn at the end of the region (reference point TSS + and center) or the beginning of the region (reference point TES) as appropriate. + + average_type_summary_plot: + type: + - "null" + - type: enum + name: "average_type_summary_plot" + symbols: ["mean", "median", "min", "max", "std", "sum"] + inputBinding: + position: 12 + prefix: "--averageTypeSummaryPlot" + doc: | + Define the type of statistic that should be plotted in the summary image above the heatmap. + The options are: “mean”, “median”, “min”, “max”, “sum” and “std”. + + what_to_show: + type: + - "null" + - type: enum + name: "what_to_show" + symbols: + - plot, heatmap and colorbar + - plot and heatmap + - heatmap only + - heatmap and colorbar + inputBinding: + position: 13 + prefix: "--whatToShow" + doc: | + The default is to include a summary or profile plot on top of the heatmap and a heatmap colorbar. + Other options are: “plot and heatmap”, “heatmap only”, “heatmap and colorbar”, and the default “plot, + heatmap and colorbar”. + + x_axis_label: + type: string? + inputBinding: + position: 14 + prefix: "--xAxisLabel" + doc: | + Description for the x-axis label + + start_label: + type: string? + inputBinding: + position: 15 + prefix: "--startLabel" + doc: | + [only for scale-regions mode] Label shown in the plot for the start of the region. + Default is TSS (transcription start site), but could be changed to anything, e.g. + “peak start”. Same for the --endLabel option. + + end_label: + type: string? + inputBinding: + position: 16 + prefix: "--endLabel" + doc: | + [only for scale-regions mode] Label shown in the plot for the region end. + Default is TES (transcription end site). + + ref_point_label: + type: string? + inputBinding: + position: 17 + prefix: "--refPointLabel" + doc: | + [only for reference-point mode] Label shown in the plot for the reference-point. + Default is the same as the reference point selected (e.g. TSS), but could be anything, + e.g. 
“peak start”. + + label_rotation_angle: + type: int? + inputBinding: + position: 18 + prefix: "--labelRotation" + doc: | + Rotation of the X-axis labels in degrees. + The default is 0, positive values denote a counter-clockwise rotation. + + regions_label: + type: + - "null" + - string + - string[] + inputBinding: + position: 19 + prefix: "--regionsLabel" + doc: | + Labels for the regions plotted in the heatmap. If more than one region is being plotted, a list of + labels separated by spaces is required. If a label itself contains a space, then quotes are needed. + For example, --regionsLabel label_1, “label 2”. + + samples_label: + type: + - "null" + - string + - string[] + inputBinding: + position: 20 + prefix: "--samplesLabel" + doc: | + Labels for the samples plotted. The default is to use the file name of the sample. The sample labels + should be separated by spaces and quoted if a label itself contains a space + E.g. --samplesLabel label-1 “label 2” + + plot_title: + type: string? + inputBinding: + position: 21 + prefix: "--plotTitle" + doc: | + Title of the plot, to be printed on top of the generated image. + Leave blank for no title. + + y_axisLabel: + type: string? + inputBinding: + position: 22 + prefix: "--yAxisLabel" + doc: | + Y-axis label for the top panel. + + y_min: + type: + - "null" + - int + - int[] + inputBinding: + position: 23 + prefix: "--yMin" + doc: | + Minimum value for the Y-axis. Multiple values, separated by spaces can be set for each profile. + If the number of yMin values is smaller than the number of plots, the values are recycled. + + y_max: + type: + - "null" + - int + - int[] + inputBinding: + position: 24 + prefix: "--yMax" + doc: | + Maximum value for the Y-axis. Multiple values, separated by spaces can be set for each profile. + If the number of yMax values is smaller than the number of plots, the values are recycled. 
+ + legend_location: + type: + - "null" + - type: enum + name: "legend_location" + symbols: + - best + - upper-right + - upper-left + - upper-center + - lower-left + - lower-right + - lower-center + - center + - center-left + - center-right + - none + inputBinding: + position: 25 + prefix: "--legendLocation" + doc: | + Location for the legend in the summary plot. Note that “none” does not work for the profiler. + + per_group: + type: boolean? + inputBinding: + position: 26 + prefix: "--perGroup" + doc: | + The default is to plot all groups of regions by sample. Using this option instead plots all + samples by group of regions. Note that this is only useful if you have multiple groups of + regions by sample rather than group. + + plot_file_format: + type: + - "null" + - type: enum + name: "plot_file_format" + symbols: ["png", "pdf", "svg", "eps", "plotly"] + inputBinding: + position: 27 + prefix: "--plotFileFormat" + doc: | + Image format type. If given, this option overrides the image format based on the plotFile ending. 
+ The available options are: “png”, “eps”, “pdf”, “plotly” and “svg” + + +outputs: + + heatmap_file: + type: File + outputBinding: + glob: $(inputs.output_filename) + doc: "Heatmap file" + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["plotHeatmap", "--verbose"] + + +stdout: plot_heatmap_stdout.log +stderr: plot_heatmap_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +s:mainEntity: + $import: ./metadata/deeptools-metadata.yaml + +label: "plotHeatmap - tool creates a heatmap for scores associated with genomic regions" +s:name: "plotHeatmap - tool creates a heatmap for scores associated with genomic regions" +s:alternateName: "plotHeatmap - tool creates a heatmap for scores associated with genomic regions" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/deeptools-plotheatmap.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + This tool creates a heatmap for 
scores associated with genomic regions. + The program requires a matrix file generated by the tool computeMatrix. + + +s:about: | + usage: plotHeatmap [--matrixFile MATRIXFILE] --outFileName OUTFILENAME + [--outFileSortedRegions FILE] [--outFileNameMatrix FILE] + [--interpolationMethod STR] [--dpi DPI] [--kmeans KMEANS] + [--hclust HCLUST] [--silhouette] [--help] [--version] + [--plotType {lines,fill,se,std}] + [--sortRegions {descend,ascend,no,keep}] + [--sortUsing {mean,median,max,min,sum,region_length}] + [--sortUsingSamples SORTUSINGSAMPLES [SORTUSINGSAMPLES ...]] + [--linesAtTickMarks] + [--clusterUsingSamples CLUSTERUSINGSAMPLES [CLUSTERUSINGSAMPLES ...]] + [--averageTypeSummaryPlot {mean,median,min,max,std,sum}] + [--missingDataColor MISSINGDATACOLOR] + [--colorMap COLORMAP [COLORMAP ...]] [--alpha ALPHA] + [--colorList COLORLIST [COLORLIST ...]] + [--colorNumber COLORNUMBER] [--zMin ZMIN [ZMIN ...]] + [--zMax ZMAX [ZMAX ...]] [--heatmapHeight HEATMAPHEIGHT] + [--heatmapWidth HEATMAPWIDTH] + [--whatToShow {plot, heatmap and colorbar,plot and heatmap,heatmap only,heatmap and colorbar}] + [--boxAroundHeatmaps BOXAROUNDHEATMAPS] + [--xAxisLabel XAXISLABEL] [--startLabel STARTLABEL] + [--endLabel ENDLABEL] [--refPointLabel REFPOINTLABEL] + [--labelRotation LABEL_ROTATION] + [--regionsLabel REGIONSLABEL [REGIONSLABEL ...]] + [--samplesLabel SAMPLESLABEL [SAMPLESLABEL ...]] + [--plotTitle PLOTTITLE] [--yAxisLabel YAXISLABEL] + [--yMin YMIN [YMIN ...]] [--yMax YMAX [YMAX ...]] + [--legendLocation {best,upper-right,upper-left,upper-center,lower-left,lower-right,lower-center,center,center-left,center-right,none}] + [--perGroup] [--plotFileFormat] [--verbose] + + This tool creates a heatmap for scores associated with genomic regions. The + program requires a matrix file generated by the tool ``computeMatrix``. + + Required arguments: + --matrixFile MATRIXFILE, -m MATRIXFILE + Matrix file from the computeMatrix tool. 
(default: + None) + --outFileName OUTFILENAME, -out OUTFILENAME, -o OUTFILENAME + File name to save the image to. The file ending will + be used to determine the image format. The available + options are: "png", "eps", "pdf" and "svg", e.g., + MyHeatmap.png. (default: None) + + Output options: + --outFileSortedRegions FILE + File name into which the regions are saved after + skipping zeros or min/max threshold values. The order + of the regions in the file follows the sorting order + selected. This is useful, for example, to generate + other heatmaps while keeping the sorting of the first + heatmap. Example: Heatmap1sortedRegions.bed (default: + None) + --outFileNameMatrix FILE + If this option is given, then the matrix of values + underlying the heatmap will be saved using this name, + e.g. MyMatrix.tab. (default: None) + --interpolationMethod STR + If the heatmap image contains a large number of + columns is usually better to use an interpolation + method to produce better results (see https://matplotl + ib.org/examples/images_contours_and_fields/interpolati + on_methods.html). Be default, plotHeatmap uses the + method `nearest` if the number of columns is 1000 or + less. Otherwise it uses the bilinear method. This + default behaviour can be changed by using any of the + following options: "nearest", "bilinear", "bicubic", + "gaussian" (default: auto) + --dpi DPI Set the DPI to save the figure. (default: 200) + + Clustering arguments: + --kmeans KMEANS Number of clusters to compute. When this option is + set, the matrix is split into clusters using the + k-means algorithm. Only works for data that is not + grouped, otherwise only the first group will be + clustered. If more specific clustering methods are + required, then save the underlying matrix and run the + clustering using other software. The plotting of the + clustering may fail with an error if a cluster has + very few members compared to the total number or + regions. 
(default: None) + --hclust HCLUST Number of clusters to compute. When this option is + set, then the matrix is split into clusters using the + hierarchical clustering algorithm, using "ward + linkage". Only works for data that is not grouped, + otherwise only the first group will be clustered. + --hclust could be very slow if you have >1000 regions. + In those cases, you might prefer --kmeans or if more + clustering methods are required you can save the + underlying matrix and run the clustering using other + software. The plotting of the clustering may fail with + an error if a cluster has very few members compared to + the total number of regions. (default: None) + --silhouette Compute the silhouette score for regions. This is only + applicable if clustering has been performed. The + silhouette score is a measure of how similar a region + is to other regions in the same cluster as opposed to + those in other clusters. It will be reported in the + final column of the BED file with regions. The + silhouette evaluation can be very slow when you have + morethan 100 000 regions. (default: False) + + Optional arguments: + --help, -h show this help message and exit + --version show program's version number and exit + --plotType {lines,fill,se,std} + "lines" will plot the profile line based on the + average type selected. "fill" fills the region between + zero and the profile curve. The fill in color is semi + transparent to distinguish different profiles. "se" + and "std" color the region between the profile and the + standard error or standard deviation of the data. + (default: lines) + --sortRegions {descend,ascend,no,keep} + Whether the heatmap should present the regions sorted. + The default is to sort in descending order based on + the mean value per region. Note that "keep" and "no" + are the same thing. (default: descend) + --sortUsing {mean,median,max,min,sum,region_length} + Indicate which method should be used for sorting. 
For + each row the method is computed. For region_length, a + dashed line is drawn at the end of the region + (reference point TSS and center) or the beginning of + the region (reference point TES) as appropriate. + (default: mean) + --sortUsingSamples SORTUSINGSAMPLES [SORTUSINGSAMPLES ...] + List of sample numbers (order as in matrix), that are + used for sorting by --sortUsing, no value uses all + samples, example: --sortUsingSamples 1 3 (default: + None) + --linesAtTickMarks Draw dashed lines from all tick marks through the + heatmap. This is then similar to the dashed line draw + at region bounds when using a reference point and + --sortUsing region_length (default: False) + --clusterUsingSamples CLUSTERUSINGSAMPLES [CLUSTERUSINGSAMPLES ...] + List of sample numbers (order as in matrix), that are + used for clustering by --kmeans or --hclust if not + given, all samples are taken into account for + clustering. Example: --ClusterUsingSamples 1 3 + (default: None) + --averageTypeSummaryPlot {mean,median,min,max,std,sum} + Define the type of statistic that should be plotted in + the summary image above the heatmap. The options are: + "mean", "median", "min", "max", "sum" and "std". + (default: mean) + --missingDataColor MISSINGDATACOLOR + If --missingDataAsZero was not set, such cases will be + colored in black by default. Using this parameter, a + different color can be set. A value between 0 and 1 + will be used for a gray scale (black is 0). For a list + of possible color names see: http://packages.python.or + g/ete2/reference/reference_svgcolors.html. Other + colors can be specified using the #rrggbb notation. + (default: black) + --colorMap COLORMAP [COLORMAP ...] + Color map to use for the heatmap. If more than one + heatmap is being plotted the color of each heatmap can + be enter individually (e.g. `--colorMap Reds Blues`). + Color maps are recycled if the number of color maps is + smaller than the number of heatmaps being plotted. 
+ Available values can be seen here: + http://matplotlib.org/users/colormaps.html The + available options are: 'Accent', 'Blues', 'BrBG', + 'BuGn', 'BuPu', 'CMRmap', 'Dark2', 'GnBu', 'Greens', + 'Greys', 'OrRd', 'Oranges', 'PRGn', 'Paired', + 'Pastel1', 'Pastel2', 'PiYG', 'PuBu', 'PuBuGn', + 'PuOr', 'PuRd', 'Purples', 'RdBu', 'RdGy', 'RdPu', + 'RdYlBu', 'RdYlGn', 'Reds', 'Set1', 'Set2', 'Set3', + 'Spectral', 'Wistia', 'YlGn', 'YlGnBu', 'YlOrBr', + 'YlOrRd', 'afmhot', 'autumn', 'binary', 'bone', 'brg', + 'bwr', 'cividis', 'cool', 'coolwarm', 'copper', + 'cubehelix', 'flag', 'gist_earth', 'gist_gray', + 'gist_heat', 'gist_ncar', 'gist_rainbow', + 'gist_stern', 'gist_yarg', 'gnuplot', 'gnuplot2', + 'gray', 'hot', 'hsv', 'icefire', 'inferno', 'jet', + 'magma', 'mako', 'nipy_spectral', 'ocean', 'pink', + 'plasma', 'prism', 'rainbow', 'rocket', 'seismic', + 'spring', 'summer', 'tab10', 'tab20', 'tab20b', + 'tab20c', 'terrain', 'twilight', 'twilight_shifted', + 'viridis', 'vlag', 'winter' (default: ['RdYlBu']) + --alpha ALPHA The alpha channel (transparency) to use for the + heatmaps. The default is 1.0 and values must be + between 0 and 1. (default: 1.0) + --colorList COLORLIST [COLORLIST ...] + List of colors to use to create a colormap. For + example, if `--colorList black,yellow,blue` is set + (colors separated by commas) then a color map that + starts with black, continues to yellow and finishes in + blue is created. If this option is selected, it + overrides the --colorMap chosen. The list of valid + color names can be seen here: + http://matplotlib.org/examples/color/named_colors.html + Hex colors are valid (e.g #34a2b1). If individual + colors for different heatmaps need to be specified + they need to be separated by space as for example: + `--colorList "white,#cccccc" "white,darkred"` As for + --colorMap, the color lists are recycled if their + number is smaller than the number of plotted heatmaps.
+ The number of transitions is defined by the + --colorNumber option. (default: None) + --colorNumber COLORNUMBER + N.B., --colorList is required for an effect. This + controls the number of transitions from one color to + the other. If --colorNumber is the number of colors in + --colorList then there will be no transitions between + the colors. (default: 256) + --zMin ZMIN [ZMIN ...], -min ZMIN [ZMIN ...] + Minimum value for the heatmap intensities. Multiple + values, separated by spaces can be set for each + heatmap. If the number of zMin values is smaller + than the number of heatmaps the values are recycled. + (default: None) + --zMax ZMAX [ZMAX ...], -max ZMAX [ZMAX ...] + Maximum value for the heatmap intensities. Multiple + values, separated by spaces can be set for each + heatmap. If the number of zMax values is smaller + than the number of heatmaps the values are recycled. + (default: None) + --heatmapHeight HEATMAPHEIGHT + Plot height in cm. The default for the heatmap height + is 28. The minimum value is 3 and the maximum is 100. + (default: 28) + --heatmapWidth HEATMAPWIDTH + Plot width in cm. The default value is 4 The minimum + value is 1 and the maximum is 100. (default: 4) + --whatToShow {plot, heatmap and colorbar,plot and heatmap,heatmap only,heatmap and colorbar} + The default is to include a summary or profile plot on + top of the heatmap and a heatmap colorbar. Other + options are: "plot and heatmap", "heatmap only", + "heatmap and colorbar", and the default "plot, heatmap + and colorbar". (default: plot, heatmap and colorbar) + --boxAroundHeatmaps BOXAROUNDHEATMAPS + By default black boxes are plot around heatmaps. This + can be turned off by setting --boxAroundHeatmaps no + (default: yes) + --xAxisLabel XAXISLABEL, -x XAXISLABEL + Description for the x-axis label. (default: gene + distance (bp)) + --startLabel STARTLABEL + [only for scale-regions mode] Label shown in the plot + for the start of the region.
Default is TSS + (transcription start site), but could be changed to + anything, e.g. "peak start". Same for the --endLabel + option. See below. (default: TSS) + --endLabel ENDLABEL [only for scale-regions mode] Label shown in the plot + for the region end. Default is TES (transcription end + site). (default: TES) + --refPointLabel REFPOINTLABEL + [only for reference-point mode] Label shown in the + plot for the reference-point. Default is the same as + the reference point selected (e.g. TSS), but could be + anything, e.g. "peak start". (default: None) + --labelRotation LABEL_ROTATION + Rotation of the X-axis labels in degrees. The default + is 0, positive values denote a counter-clockwise + rotation. (default: 0.0) + --regionsLabel REGIONSLABEL [REGIONSLABEL ...], -z REGIONSLABEL [REGIONSLABEL ...] + Labels for the regions plotted in the heatmap. If more + than one region is being plotted, a list of labels + separated by spaces is required. If a label itself + contains a space, then quotes are needed. For example, + --regionsLabel label_1, "label 2". (default: None) + --samplesLabel SAMPLESLABEL [SAMPLESLABEL ...] + Labels for the samples plotted. The default is to use + the file name of the sample. The sample labels should + be separated by spaces and quoted if a label + itself contains a space E.g. --samplesLabel label-1 + "label 2" (default: None) + --plotTitle PLOTTITLE, -T PLOTTITLE + Title of the plot, to be printed on top of the + generated image. Leave blank for no title. (default: ) + --yAxisLabel YAXISLABEL, -y YAXISLABEL + Y-axis label for the top panel. (default: ) + --yMin YMIN [YMIN ...] + Minimum value for the Y-axis. Multiple values, + separated by spaces can be set for each profile. If + the number of yMin values is smaller than the number of + plots, the values are recycled. (default: None) + --yMax YMAX [YMAX ...] + Maximum value for the Y-axis. Multiple values, + separated by spaces can be set for each profile.
If + the number of yMax values is smaller than the number of + plots, the values are recycled. (default: None) + --legendLocation {best,upper-right,upper-left,upper-center,lower-left,lower-right,lower-center,center,center-left,center-right,none} + Location for the legend in the summary plot. Note that + "none" does not work for the profiler. (default: best) + --perGroup The default is to plot all groups of regions by + sample. Using this option instead plots all samples by + group of regions. Note that this is only useful if you + have multiple groups of regions. by sample rather than + group. (default: False) + --plotFileFormat Image format type. If given, this option overrides the + image format based on the plotFile ending. The + available options are: "png", "eps", "pdf", "plotly" + and "svg" (default: None) + --verbose If set, warning messages and additional information + are given. (default: False) + + An example usage is: plotHeatmap -m \ No newline at end of file diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index f0994936..6684d650 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.15 + dockerPull: biowardrobe2/sc-tools:v0.0.21 inputs: @@ -566,11 +566,12 @@ doc: | s:about: | usage: sc_atac_cluster.R - [-h] --query QUERY [--dimensions [DIMENSIONS ...]] + [-h] --query QUERY [--dimensions [DIMENSIONS [DIMENSIONS ...]]] [--ametric {euclidean,cosine,manhattan,hamming}] [--algorithm {louvain,mult-louvain,slm,leiden}] - [--resolution [RESOLUTION ...]] [--fragments FRAGMENTS] - [--genes [GENES ...]] [--diffpeaks] [--logfc LOGFC] [--minpct MINPCT] + [--resolution [RESOLUTION [RESOLUTION ...]]] [--fragments FRAGMENTS] + [--genes [GENES [GENES ...]]] [--diffpeaks] [--logfc LOGFC] + [--minpct MINPCT] [--testuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] [--pdf] [--verbose] [--h5seurat] [--h5ad] [--cbbuild]
[--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] @@ -578,14 +579,14 @@ s:about: | Single-cell ATAC-Seq Cluster Analysis - options: + optional arguments: -h, --help show this help message and exit --query QUERY Path to the RDS file to load Seurat object from. This file should include chromatin accessibility information stored in the ATAC assay, as well as 'atac_lsi' and 'atacumap' dimensionality reductions applied to that assay. - --dimensions [DIMENSIONS ...] + --dimensions [DIMENSIONS [DIMENSIONS ...]] Dimensionality to use when constructing nearest- neighbor graph before clustering (from 1 to 50). If single value N is provided, use from 2 to N @@ -597,7 +598,7 @@ s:about: | --algorithm {louvain,mult-louvain,slm,leiden} Algorithm for modularity optimization when running clustering. Default: slm - --resolution [RESOLUTION ...] + --resolution [RESOLUTION [RESOLUTION ...]] Clustering resolution applied to the constructed nearest-neighbor graph. Can be set as an array but only the first item from the list will be used for @@ -608,7 +609,8 @@ s:about: | Count and barcode information for every ATAC fragment used in the loaded Seurat object. File should be saved in TSV format with tbi-index file. - --genes [GENES ...] Genes of interest to build Tn5 insertion frequency + --genes [GENES [GENES ...]] + Genes of interest to build Tn5 insertion frequency plots for the nearest peaks. 
If loaded Seurat object includes genes expression information in the RNA assay it will be additionally shown on the right side of the diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl new file mode 100644 index 00000000..4bb7e991 --- /dev/null +++ b/tools/sc-atac-coverage.cwl @@ -0,0 +1,274 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: EnvVarRequirement + envDef: + R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-tools:v0.0.21 + + +inputs: + + query_data_rds: + type: File + inputBinding: + prefix: "--query" + doc: | + Path to the RDS file to load Seurat object from. This file + should include chromatin accessibility information stored + in the ATAC assay with a proper seqinfo data. + + atac_fragments_file: + type: File + secondaryFiles: + - .tbi + inputBinding: + prefix: "--fragments" + doc: | + Count and barcode information for every ATAC fragment used in the + loaded Seurat object. File should be saved in TSV format and to be + tbi-indexed. + + splitby: + type: + - "null" + - string + - string[] + inputBinding: + prefix: "--splitby" + doc: | + Column from the Seurat object metadata to split cells into groups. + May be one of the columns added with --metadata or --barcodes + parameters. Default: split by dataset + + datasets_metadata: + type: File? + inputBinding: + prefix: "--metadata" + doc: | + Path to the TSV/CSV file to optionally extend Seurat object metadata with + categorical values using samples identities. First column - 'library_id' + should correspond to all unique values from the 'new.ident' column of the + loaded Seurat object. If any of the provided in this file columns are already + present in the Seurat object metadata, they will be overwritten. When combined + with --barcodes parameter, first the metadata will be extended, then barcode + filtering will be applied. 
Default: no extra metadata is added + + barcodes_data: + type: File? + inputBinding: + prefix: "--barcodes" + doc: | + Path to the TSV/CSV file to optionally prefilter and extend Seurat object + metadata by selected barcodes. First column should be named as 'barcode'. + If file includes any other columns they will be added to the Seurat object + metadata overwriting the existing ones if those are present. + Default: all cells used, no extra metadata is added + + flank_distance: + type: int? + inputBinding: + prefix: "--flank" + doc: | + Distance in bp to flank both start and end of each fragment in both + directions to generate cut sites coverage. Default: 5 + + verbose: + type: boolean? + inputBinding: + prefix: "--verbose" + doc: | + Print debug information. + Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix. + Default: ./sc + + parallel_memory_limit: + type: int? + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Default: 32 + + vector_memory_limit: + type: int? + default: 128 + doc: | + Maximum vector memory in GB allowed to be used by R. + Default: 128 + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use.
+ Default: 1 + + +outputs: + + peaks_bigbed_file: + type: File + outputBinding: + glob: "*_peaks.bigBed" + doc: | + Locations of open-chromatin regions ("peaks") + in bigBed format + + cut_sites_bigwig_file: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cut_cov.bigWig" + doc: | + Genome coverage calculated for Tn5 cut sites + in bigWig format + + fragments_bigwig_file: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_frg_cov.bigWig" + doc: | + Genome coverage calculated for fragments + in bigWig format + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["sc_atac_coverage.R"] + +stdout: sc_atac_coverage_stdout.log +stderr: sc_atac_coverage_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-cell ATAC-Seq Genome Coverage" +s:name: "Single-cell ATAC-Seq Genome Coverage" +s:alternateName: "Creates genome coverage bigWig files from the provided fragments file and selected grouping parameters" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-atac-coverage.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: 
s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell ATAC-Seq Genome Coverage + + Creates genome coverage bigWig files from the provided fragments file + and selected grouping parameters. + + --tmpdir parameter is not exposed as input. + + +s:about: | + usage: sc_atac_coverage.R + [-h] --query QUERY --fragments FRAGMENTS [--splitby [SPLITBY ...]] + [--metadata METADATA] [--barcodes BARCODES] [--flank FLANK] [--verbose] + [--tmpdir TMPDIR] [--output OUTPUT] [--cpus CPUS] [--memory MEMORY] + + Single-cell ATAC-Seq Genome Coverage + + options: + -h, --help show this help message and exit + --query QUERY Path to the RDS file to load Seurat object from. This + file should include chromatin accessibility + information stored in the ATAC assay with a proper + seqinfo data. + --fragments FRAGMENTS + Count and barcode information for every ATAC fragment + used in the loaded Seurat object. File should be saved + in TSV format and to be tbi-indexed. + --splitby [SPLITBY ...] + Column from the Seurat object metadata to split cells + into groups. May be one of the columns added with + --metadata or --barcodes parameters. Default: split by + dataset + --metadata METADATA Path to the TSV/CSV file to optionally extend Seurat + object metadata with categorical values using samples + identities. First column - 'library_id' should + correspond to all unique values from the 'new.ident' + column of the loaded Seurat object. If any of the + provided in this file columns are already present in + the Seurat object metadata, they will be overwritten. + When combined with --barcodes parameter, first the + metadata will be extended, then barcode filtering will + be applied. 
Default: no extra metadata is added + --barcodes BARCODES Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata overwriting the existing ones if + those are present. Default: all cells used, no extra + metadata is added + --flank FLANK Distance in bp to flank both start and end of each + fragment in both directions to generate cut sites + coverage. Default: 5 + --verbose Print debug information. Default: false + --tmpdir TMPDIR Directory to keep temporary files. Default: either + /tmp or defined by environment variables TMPDIR, TMP, + TEMP. + --output OUTPUT Output prefix. Default: ./sc + --cpus CPUS Number of cores/cpus to use. Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple --cpus. Default: 32 \ No newline at end of file diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl new file mode 100644 index 00000000..7f6c49db --- /dev/null +++ b/tools/sc-atac-dbinding.cwl @@ -0,0 +1,710 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: EnvVarRequirement + envDef: + R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-tools:v0.0.21 + + +inputs: + + query_data_rds: + type: File + inputBinding: + prefix: "--query" + doc: | + Path to the RDS file to load Seurat object from. + This file should include chromatin accessibility + information stored in the ATAC assay. Additionally + 'rnaumap', and/or 'atacumap', and/or 'wnnumap' + dimensionality reductions should be present.
+ + atac_fragments_file: + type: File + secondaryFiles: + - .tbi + inputBinding: + prefix: "--fragments" + doc: | + Count and barcode information for every ATAC fragment + used in the loaded Seurat object. File should be saved + in TSV format with tbi-index file. + + datasets_metadata: + type: File? + inputBinding: + prefix: "--metadata" + doc: | + Path to the TSV/CSV file to optionally extend Seurat + object metadata with categorical values using samples + identities. First column - 'library_id' should + correspond to all unique values from the 'new.ident' + column of the loaded Seurat object. If any of the + provided in this file columns are already present in + the Seurat object metadata, they will be overwritten. + When combined with --barcodes parameter, first the + metadata will be extended, then barcode filtering will + be applied. Default: no extra metadata is added + + barcodes_data: + type: File? + inputBinding: + prefix: "--barcodes" + doc: | + Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata overwriting the existing ones if + those are present. Default: all cells used, no extra + metadata is added + + groupby: + type: string? + inputBinding: + prefix: "--groupby" + doc: | + Column from the Seurat object metadata to group cells + for optional subsetting when combined with --subset + parameter. May be one of the extra metadata columns + added with --metadata or --barcodes parameters. + Ignored if --subset is not set. Default: do not + subset, include all cells into analysis. + + subset: + type: + - "null" + - string + - string[] + inputBinding: + prefix: "--subset" + doc: | + Values from the column set with --groupby parameter to + subset cells before running differential binding + analysis. Ignored if --groupby is not provided.
+ Default: do not subset cells, include all of them. + + splitby: + type: string + inputBinding: + prefix: "--splitby" + doc: | + Column from the Seurat object metadata to split cells + into two groups to run --second vs --first + differential binding analysis. May be one of the extra + metadata columns added with --metadata or --barcodes + parameters. + + first_cond: + type: string + inputBinding: + prefix: "--first" + doc: | + Value from the Seurat object metadata column set with + --splitby parameter to define the first group of cells + for differential binding analysis. + + second_cond: + type: string + inputBinding: + prefix: "--second" + doc: | + Value from the Seurat object metadata column set with + --splitby parameter to define the second group of + cells for differential binding analysis. + + analysis_method: + type: + - "null" + - type: enum + symbols: + - "negative-binomial" # (negbinom) Negative Binomial Generalized Linear Model (use FindMarkers with peaks from Seurat object) + - "poisson" # (poisson) Poisson Generalized Linear Model (use FindMarkers with peaks from Seurat object) + - "logistic-regression" # (LR) Logistic Regression (use FindMarkers with peaks from Seurat object) + - "mast" # (MAST) MAST package (use FindMarkers with peaks from Seurat object) + - "manorm2" # call peaks for each group with MACS2, run MAnorm2 + inputBinding: + prefix: "--test" + doc: | + Test type to use in differential binding analysis. For + all tests except manorm2, peaks present in the loaded + Seurat object will be used. If manorm2 test selected, + peaks will be called per group defined by --splitby + parameter. Default: logistic-regression + + genome_type: + type: + - "null" + - type: enum + symbols: + - "hs" + - "mm" + inputBinding: + prefix: "--genome" + doc: | + Genome type of the sequencing data loaded from the + Seurat object. It will be used for effective genome + size selection when calling peaks with MACS2. Ignored + if --test is not set to manorm2. 
Default: hs (2.7e9) + + minimum_qvalue: + type: float? + inputBinding: + prefix: "--qvalue" + doc: | + Minimum FDR (q-value) cutoff for MACS2 peak detection. + Ignored if --test is not set to manorm2. Default: 0.05 + + minimum_peak_gap: + type: int? + inputBinding: + prefix: "--minpeakgap" + doc: | + If a distance between peaks is smaller than the + provided value they will be merged before splitting + them into reference genomic bins of size --binsize. + Ignored if --test is not set to manorm2. Default: 150 + + bin_size: + type: int? + inputBinding: + prefix: "--binsize" + doc: | + The size of non-overlapping reference genomic bins + used by MAnorm2 when generating a table of reads + counts per peaks. Ignored if --test is not set to + manorm2. Default: 1000 + + maximum_peaks: + type: int? + inputBinding: + prefix: "--maxpeaks" + doc: | + The maximum number of the most significant (based on + qvalue) peaks to keep from each group of cells when + constructing reference genomic bins. Ignored if --test + is not set to manorm2. Default: keep all peaks + + blacklist_regions_file: + type: File? + inputBinding: + prefix: "--blacklist" + doc: | + Path to the optional BED file with the genomic + blacklist regions to be filtered out before running + differential binding analysis. Any reference genomic + bin overlapping a blacklist region will be removed + from the output. Ignored if --test is not set to + manorm2. + + maximum_padj: + type: float? + inputBinding: + prefix: "--padj" + doc: | + In the exploratory visualization part of the analysis + output only differentially bound peaks with adjusted + P-value not bigger than this value. Default: 0.05 + + minimum_logfc: + type: float? + inputBinding: + prefix: "--logfc" + doc: | + In the exploratory visualization part of the analysis + output only differentially bound peaks with log2 Fold + Change not smaller than this value. Default: 1.0 + + export_pdf_plots: + type: boolean? 
+ inputBinding: + prefix: "--pdf" + doc: | + Export plots in PDF. + Default: false + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + inputBinding: + prefix: "--theme" + doc: | + Color theme for all generated plots. One of gray, bw, + linedraw, light, dark, minimal, classic, void. + Default: classic + + verbose: + type: boolean? + inputBinding: + prefix: "--verbose" + doc: | + Print debug information. + Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix. + Default: ./sc + + parallel_memory_limit: + type: int? + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Default: 32 + + vector_memory_limit: + type: int? + default: 128 + doc: | + Maximum vector memory in GB allowed to be used by R. + Default: 128 + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. + Default: 1 + + +outputs: + + umap_rd_rnaumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_rd_rnaumap.png" + doc: | + Cells UMAP split by selected criteria, + optionally subsetted to the specific + group (rnaumap dim. reduction). + PNG format + + umap_rd_rnaumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_rd_rnaumap.pdf" + doc: | + Cells UMAP split by selected criteria, + optionally subsetted to the specific + group (rnaumap dim. reduction). + PDF format + + umap_rd_atacumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_rd_atacumap.png" + doc: | + Cells UMAP split by selected criteria, + optionally subsetted to the specific + group (atacumap dim. reduction). + PNG format + + umap_rd_atacumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_rd_atacumap.pdf" + doc: | + Cells UMAP split by selected criteria, + optionally subsetted to the specific + group (atacumap dim. 
reduction). + PDF format + + umap_rd_wnnumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_rd_wnnumap.png" + doc: | + Cells UMAP split by selected criteria, + optionally subsetted to the specific + group (wnnumap dim. reduction). + PNG format + + umap_rd_wnnumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_rd_wnnumap.pdf" + doc: | + Cells UMAP split by selected criteria, + optionally subsetted to the specific + group (wnnumap dim. reduction). + PDF format + + seurat_peaks_bigbed_file: + type: File? + outputBinding: + glob: "*_seurat_peaks.bigBed" + doc: | + Peaks in bigBed format extracted + from the loaded from provided RDS + file Seurat object. + + first_fragments_bigwig_file: + type: File + outputBinding: + glob: "*_first.bigWig" + doc: | + Genome coverage in bigWig format calculated + for fragments from the cells that belong to + the group defined by the --first and + --groupby parameters. + + second_fragments_bigwig_file: + type: File + outputBinding: + glob: "*_second.bigWig" + doc: | + Genome coverage in bigWig format calculated + for fragments from the cells that belong to + the group defined by the --second and + --groupby parameters. + + first_tn5ct_bigwig_file: + type: File? + outputBinding: + glob: "*_first_tn5ct.bigWig" + doc: | + Genome coverage in bigWig format calculated + for Tn5 cut sites from the cells that belong + to the group defined by the --first and + --groupby parameters. + + second_tn5ct_bigwig_file: + type: File? + outputBinding: + glob: "*_second_tn5ct.bigWig" + doc: | + Genome coverage in bigWig format calculated + for Tn5 cut sites from the cells that belong + to the group defined by the --second and + --groupby parameters. + + first_peaks_xls_file: + type: File? + outputBinding: + glob: "*_first_peaks.xls" + doc: | + MACS2 report in XLS format for peaks + called from the Tn5 cut sites of the + cells that belong to the group defined + by the --first and --groupby parameters. 
+ + second_peaks_xls_file: + type: File? + outputBinding: + glob: "*_second_peaks.xls" + doc: | + MACS2 report in XLS format for peaks + called from the Tn5 cut sites of the + cells that belong to the group defined + by the --second and --groupby parameters. + + first_peaks_bed_file: + type: File? + outputBinding: + glob: "*_first_peaks.narrowPeak" + doc: | + MACS2 peaks in narrowPeak format called + from the Tn5 cut sites of the cells that + belong to the group defined by the --first + and --groupby parameters. + + second_peaks_bed_file: + type: File? + outputBinding: + glob: "*_second_peaks.narrowPeak" + doc: | + MACS2 peaks in narrowPeak format called + from the Tn5 cut sites of the cells that + belong to the group defined by the --second + and --groupby parameters. + + first_summits_bed_file: + type: File? + outputBinding: + glob: "*_first_summits.bed" + doc: | + MACS2 peaks summits in BED format called + from the Tn5 cut sites of the cells that + belong to the group defined by the --first + and --groupby parameters. + + second_summits_bed_file: + type: File? + outputBinding: + glob: "*_second_summits.bed" + doc: | + MACS2 peaks summits in BED format called + from the Tn5 cut sites of the cells that + belong to the group defined by the --second + and --groupby parameters. + + diff_bound_sites: + type: File + outputBinding: + glob: "*_db_sites.tsv" + doc: | + Not filtered differentially bound sites + in TSV format + + dbnd_vlcn_plot_png: + type: File? + outputBinding: + glob: "*_dbnd_vlcn.png" + doc: | + Volcano plot of differentially bound sites. + PNG format + + dbnd_vlcn_plot_pdf: + type: File? + outputBinding: + glob: "*_dbnd_vlcn.pdf" + doc: | + Volcano plot of differentially bound sites. + PDF format + + first_enrch_bigbed_file: + type: File? + outputBinding: + glob: "*_first_enrch.bigBed" + doc: | + Peaks in bigBed format filtered by + --padj and --logfc thresholds enriched + in the group of cells defined by the + --first and --groupby parameters. 
+ + second_enrch_bigbed_file: + type: File? + outputBinding: + glob: "*_second_enrch.bigBed" + doc: | + Peaks in bigBed format filtered by + --padj and --logfc thresholds enriched + in the group of cells defined by the + --second and --groupby parameters. + + first_enrch_bed_file: + type: File? + outputBinding: + glob: "*_first_enrch.bed" + doc: | + Peaks in BED format filtered by + --padj and --logfc thresholds enriched + in the group of cells defined by the + --first and --groupby parameters. + + second_enrch_bed_file: + type: File? + outputBinding: + glob: "*_second_enrch.bed" + doc: | + Peaks in BED format filtered by + --padj and --logfc thresholds enriched + in the group of cells defined by the + --second and --groupby parameters. + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["sc_atac_dbinding.R"] + +stdout: sc_atac_dbinding_stdout.log +stderr: sc_atac_dbinding_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-cell ATAC-Seq Differential Binding Analysis" +s:name: "Single-cell ATAC-Seq Differential Binding Analysis" +s:alternateName: "Identifies differential bound sites between two groups of cells" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-atac-dbinding.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: 
"https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell ATAC-Seq Differential Binding Analysis + + Identifies differential bound sites between two groups of cells + --tmpdir parameter is not exposed as input. + + +s:about: | + usage: sc_atac_dbinding.R [-h] --query QUERY --fragments FRAGMENTS + [--metadata METADATA] [--barcodes BARCODES] + [--groupby GROUPBY] [--subset [SUBSET [SUBSET ...]]] + --splitby SPLITBY --first FIRST --second SECOND + [--test {negative-binomial,poisson,logistic-regression,mast,manorm2}] + [--genome {hs,mm}] [--qvalue QVALUE] + [--minpeakgap MINPEAKGAP] [--binsize BINSIZE] + [--maxpeaks MAXPEAKS] [--blacklist BLACKLIST] + [--padj PADJ] [--logfc LOGFC] [--pdf] [--verbose] + [--tmpdir TMPDIR] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + + Single-cell ATAC-Seq Differential Binding Analysis + + optional arguments: + -h, --help show this help message and exit + --query QUERY Path to the RDS file to load Seurat object from. This + file should include chromatin accessibility + information stored in the ATAC assay. Additionally + 'rnaumap', and/or 'atacumap', and/or 'wnnumap' + dimensionality reductions should be present. + --fragments FRAGMENTS + Count and barcode information for every ATAC fragment + used in the loaded Seurat object. File should be saved + in TSV format with tbi-index file. + --metadata METADATA Path to the TSV/CSV file to optionally extend Seurat + object metadata with categorical values using samples + identities. 
First column - 'library_id' should + correspond to all unique values from the 'new.ident' + column of the loaded Seurat object. If any of the + provided in this file columns are already present in + the Seurat object metadata, they will be overwritten. + When combined with --barcodes parameter, first the + metadata will be extended, then barcode filtering will + be applied. Default: no extra metadata is added + --barcodes BARCODES Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata ovewriting the existing ones if + those are present. Default: all cells used, no extra + metadata is added + --groupby GROUPBY Column from the Seurat object metadata to group cells + for optional subsetting when combined with --subset + parameter. May be one of the extra metadata columns + added with --metadata or --barcodes parameters. + Ignored if --subset is not set. Default: do not + subset, include all cells into analysis. + --subset [SUBSET [SUBSET ...]] + Values from the column set with --groupby parameter to + subset cells before running differential binding + analysis. Ignored if --groupby is not provided. + Default: do not subset cells, include all of them. + --splitby SPLITBY Column from the Seurat object metadata to split cells + into two groups to run --second vs --first + differential binding analysis. May be one of the extra + metadata columns added with --metadata or --barcodes + parameters. + --first FIRST Value from the Seurat object metadata column set with + --splitby parameter to define the first group of cells + for differential binding analysis. + --second SECOND Value from the Seurat object metadata column set with + --splitby parameter to define the second group of + cells for differential binding analysis. 
+ --test {negative-binomial,poisson,logistic-regression,mast,manorm2} + Test type to use in differential binding analysis. For + all tests except manorm2, peaks present in the loaded + Seurat object will be used. If manorm2 test selected, + peaks will be called per group defined by --splitby + parameter. Default: logistic-regression + --genome {hs,mm} Genome type of the sequencing data loaded from the + Seurat object. It will be used for effective genome + size selection when calling peaks with MACS2. Ignored + if --test is not set to manorm2. Default: hs (2.7e9) + --qvalue QVALUE Minimum FDR (q-value) cutoff for MACS2 peak detection. + Ignored if --test is not set to manorm2. Default: 0.05 + --minpeakgap MINPEAKGAP + If a distance between peaks is smaller than the + provided value they will be merged before splitting + them into reference genomic bins of size --binsize. + Ignored if --test is not set to manorm2. Default: 150 + --binsize BINSIZE The size of non-overlapping reference genomic bins + used by MAnorm2 when generating a table of reads + counts per peaks. Ignored if --test is not set to + manorm2. Default: 1000 + --maxpeaks MAXPEAKS The maximum number of the most significant (based on + qvalue) peaks to keep from each group of cells when + constructing reference genomic bins. Ignored if --test + is not set to manorm2. Default: keep all peaks + --blacklist BLACKLIST + Path to the optional BED file with the genomic + blacklist regions to be filtered out before running + differential binding analysis. Any reference genomic + bin overlapping a blacklist region will be removed + from the output. Ignored if --test is not set to + manorm2. + --padj PADJ In the exploratory visualization part of the analysis + output only differentially bound peaks with adjusted + P-value not bigger than this value. 
Default: 0.05 + --logfc LOGFC In the exploratory visualization part of the analysis + output only differentially bound peaks with log2 Fold + Change not smaller than this value. Default: 1.0 + --pdf Export plots in PDF. Default: false + --verbose Print debug information. Default: false + --tmpdir TMPDIR Directory to keep temporary files. Default: either + /tmp or defined by environment variables TMPDIR, TMP, + TEMP. + --output OUTPUT Output prefix. Default: ./sc + --theme {gray,bw,linedraw,light,dark,minimal,classic,void} + Color theme for all generated plots. Default: classic + --cpus CPUS Number of cores/cpus to use. Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple --cpus. Default: 32 \ No newline at end of file diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index 51fd7ded..f803bbd1 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.15 + dockerPull: biowardrobe2/sc-tools:v0.0.21 inputs: @@ -127,8 +127,8 @@ inputs: provided, use from 2 to N LSI components. If multiple values are provided, subset to only selected LSI components. In combination with --ntgr set to harmony, - selected principle components will be used in Harmony - integration. + multiple values will result in using all dimensions + starting from 1(!) to the max of the provided values. Default: from 2 to 10 umap_spread: @@ -383,6 +383,102 @@ outputs: Split by grouping condition cells UMAP. PDF format + umap_spl_umi_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_umi.png" + doc: | + Split by the UMI per cell counts cells UMAP. + PNG format + + umap_spl_umi_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_umi.pdf" + doc: | + Split by the UMI per cell counts cells UMAP. + PDF format + + umap_spl_peak_plot_png: + type: File? 
+ outputBinding: + glob: "*_umap_spl_peak.png" + doc: | + Split by the peaks per cell counts cells UMAP. + PNG format + + umap_spl_peak_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_peak.pdf" + doc: | + Split by the peaks per cell counts cells UMAP. + PDF format + + umap_spl_tss_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_tss.png" + doc: | + Split by the TSS enrichment score cells UMAP. + PNG format + + umap_spl_tss_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_tss.pdf" + doc: | + Split by the TSS enrichment score cells UMAP. + PDF format + + umap_spl_ncls_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_ncls.png" + doc: | + Split by the nucleosome signal cells UMAP. + PNG format + + umap_spl_ncls_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_ncls.pdf" + doc: | + Split by the nucleosome signal cells UMAP. + PDF format + + umap_spl_frip_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_frip.png" + doc: | + Split by the FRiP cells UMAP. + PNG format + + umap_spl_frip_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_frip.pdf" + doc: | + Split by the FRiP cells UMAP. + PDF format + + umap_spl_blck_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_blck.png" + doc: | + Split by the genomic blacklist regions fraction cells UMAP. + PNG format + + umap_spl_blck_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_blck.pdf" + doc: | + Split by the genomic blacklist regions fraction cells UMAP. + PDF format + ucsc_cb_config_data: type: Directory? 
outputBinding: @@ -494,13 +590,15 @@ s:about: | usage: sc_atac_reduce.R [-h] --query QUERY [--metadata METADATA] [--barcodes BARCODES] [--norm {log-tfidf,tf-logidf,logtf-logidf,idf}] - [--ntgr {signac,harmony,none}] [--ntgrby [NTGRBY ...]] - [--minvarpeaks MINVARPEAKS] [--dimensions [DIMENSIONS ...]] - [--uspread USPREAD] [--umindist UMINDIST] [--uneighbors UNEIGHBORS] - [--umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis, - mahalanobis,wminkowski,seuclidean,cosine,correlation,haversine, - hamming,jaccard,dice,russelrao,kulsinski,ll_dirichlet,hellinger, - rogerstanimoto,sokalmichener,sokalsneath,yule}] + [--ntgr {signac,harmony,none}] [--ntgrby [NTGRBY [NTGRBY ...]]] + [--minvarpeaks MINVARPEAKS] + [--dimensions [DIMENSIONS [DIMENSIONS ...]]] [--uspread USPREAD] + [--umindist UMINDIST] [--uneighbors UNEIGHBORS] + [--umetric {euclidean,manhattan,chebyshev,minkowski,canberra, + braycurtis,mahalanobis,wminkowski,seuclidean,cosine, + correlation,haversine,hamming,jaccard,dice,russelrao, + kulsinski,ll_dirichlet,hellinger,rogerstanimoto, + sokalmichener,sokalsneath,yule}] [--umethod {uwot,uwot-learn,umap-learn}] [--pdf] [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] @@ -508,7 +606,7 @@ s:about: | Single-cell ATAC-Seq Dimensionality Reduction Analysis - options: + optional arguments: -h, --help show this help message and exit --query QUERY Path to the RDS file to load Seurat object from. This file should include chromatin accessibility @@ -540,7 +638,7 @@ s:about: | Integration method used for joint analysis of multiple datasets. Automatically set to 'none' if loaded Suerat object includes only one dataset. Default: signac - --ntgrby [NTGRBY ...] + --ntgrby [NTGRBY [NTGRBY ...]] Column(s) from the Seurat object metadata to define the variable(s) that should be integrated out when running multiple datasets integration with harmony. 
@@ -554,14 +652,15 @@ s:about: | cells peaks as highly variable. These peaks are used for datasets integration, scaling and dimensionality reduction. Default: 0 (use all available peaks) - --dimensions [DIMENSIONS ...] + --dimensions [DIMENSIONS [DIMENSIONS ...]] Dimensionality to use for datasets integration and UMAP projection (from 2 to 50). If single value N is provided, use from 2 to N LSI components. If multiple values are provided, subset to only selected LSI components. In combination with --ntgr set to harmony, - selected principle components will be used in Harmony - integration. Default: from 2 to 10 + multiple values will result in using all dimensions + starting from 1(!) to the max of the provided values. + Default: from 2 to 10 --uspread USPREAD The effective scale of embedded points on UMAP. In combination with '--mindist' it determines how clustered/clumped the embedded points are. Default: 1 @@ -577,10 +676,10 @@ s:about: | structure being preserved at the loss of detailed local structure. In general this parameter should often be in the range 5 to 50. Default: 30 - --umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis,mahalanobis, - wminkowski,seuclidean,cosine,correlation,haversine,hamming,jaccard,dice, - russelrao,kulsinski,ll_dirichlet,hellinger,rogerstanimoto,sokalmichener, - sokalsneath,yule} + --umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis, + mahalanobis,wminkowski,seuclidean,cosine,correlation,haversine, + hamming,jaccard,dice,russelrao,kulsinski,ll_dirichlet,hellinger, + rogerstanimoto,sokalmichener,sokalsneath,yule} The metric to use to compute distances in high dimensional space for UMAP. 
Default: cosine --umethod {uwot,uwot-learn,umap-learn} diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index 36a4746a..d59cfcb9 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.15 + dockerPull: biowardrobe2/sc-tools:v0.0.21 inputs: @@ -911,14 +911,14 @@ s:about: | [--rnatestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] [--ataclogfc ATACLOGFC] [--atacminpct ATACMINPCT] [--atactestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] - [--fragments FRAGMENTS] [--genes [GENES ...]] [--pdf] [--verbose] - [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] + [--fragments FRAGMENTS] [--genes [GENES [GENES ...]]] [--pdf] + [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] Single-cell Manual Cell Type Assignment - options: + optional arguments: -h, --help show this help message and exit --query QUERY Path to the RDS file to load Seurat object from. This file should include genes expression and/or chromatin @@ -985,7 +985,8 @@ s:about: | used in the loaded Seurat object. File should be saved in TSV format with tbi-index file. Ignored if the loaded Seurat object doesn't include ATAC assay. - --genes [GENES ...] Genes of interest to build gene expression and/or Tn5 + --genes [GENES [GENES ...]] + Genes of interest to build gene expression and/or Tn5 insertion frequency plots for the nearest peaks. To build gene expression plots the loaded Seurat object should include RNA assay. 
To build Tn5 insertion diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index 6c89c170..bdb9adb2 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.15 + dockerPull: biowardrobe2/sc-tools:v0.0.21 inputs: @@ -55,6 +55,13 @@ inputs: doc: | Path to the genome annotation file in GTF format. + chrom_length_file: + type: File + inputBinding: + prefix: "--seqinfo" + doc: | + Path to the headerless chromosome length file in TSV format + grouping_data: type: File? inputBinding: @@ -258,6 +265,60 @@ inputs: scores filters. Default: do not call peaks + remove_doublets: + type: + - "null" + - type: enum + symbols: + - "union" + - "onlyrna" + - "onlyatac" + - "intersect" + inputBinding: + prefix: "--removedoublets" + doc: | + Remove cells that were identified as doublets. For + RNA assay cells with UMI < 200 will not be evaluated. + Default: do not remove doublets + + rna_doublet_rate: + type: float? + inputBinding: + prefix: "--rnadbr" + doc: | + Expected RNA doublet rate. Default: 1 percent per + thousand cells captured with 10x genomics + + rna_doublet_rate_sd: + type: float? + inputBinding: + prefix: "--rnadbrsd" + doc: | + Uncertainty range in the RNA doublet rate, interpreted as + a +/- around the value provided in --rnadbr. Set to 0 to + disable. Set to 1 to make the threshold depend entirely + on the misclassification rate. Default: 40 percents of the + value provided in --rnadbr + + atac_doublet_rate: + type: float? + inputBinding: + prefix: "--atacdbr" + doc: | + Expected ATAC doublet rate. Default: 1 percent per thousand + cells captured with 10x genomics + + atac_doublet_rate_sd: + type: float? + inputBinding: + prefix: "--atacdbrsd" + doc: | + Uncertainty range in the ATAC doublet rate, interpreted as + a +/- around the value provided in --atacdbr. Set to 0 to + disable. 
Set to 1 to make the threshold depend entirely + on the misclassification rate. Default: 40 percents of the + value provided in --atacdbr + export_pdf_plots: type: boolean? inputBinding: @@ -576,6 +637,54 @@ outputs: QC metrics per cell density (not filtered). PDF format + raw_rnadbl_plot_png: + type: File? + outputBinding: + glob: "*_raw_rnadbl.png" + doc: | + Percentage of RNA doublets per dataset (not filtered). + PNG format + + raw_rnadbl_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_rnadbl.pdf" + doc: | + Percentage of RNA doublets per dataset (not filtered). + PDF format + + raw_atacdbl_plot_png: + type: File? + outputBinding: + glob: "*_raw_atacdbl.png" + doc: | + Percentage of ATAC doublets per dataset (not filtered). + PNG format + + raw_atacdbl_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_atacdbl.pdf" + doc: | + Percentage of ATAC doublets per dataset (not filtered). + PDF format + + raw_vrlpdbl_plot_png: + type: File? + outputBinding: + glob: "*_raw_vrlpdbl.png" + doc: | + Doublets overlap for RNA and ATAC assays per dataset (not filtered). + PNG format + + raw_vrlpdbl_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_vrlpdbl.pdf" + doc: | + Doublets overlap for RNA and ATAC assays per dataset (not filtered). + PDF format + raw_tss_nrch_plot_png: type: File? outputBinding: @@ -948,6 +1057,54 @@ outputs: QC metrics per cell density (intermediate filtered). PDF format + mid_fltr_rnadbl_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_rnadbl.png" + doc: | + Percentage of RNA doublets per dataset (intermediate filtered). + PNG format + + mid_fltr_rnadbl_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_rnadbl.pdf" + doc: | + Percentage of RNA doublets per dataset (intermediate filtered). + PDF format + + mid_fltr_atacdbl_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_atacdbl.png" + doc: | + Percentage of ATAC doublets per dataset (intermediate filtered). 
+ PNG format + + mid_fltr_atacdbl_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_atacdbl.pdf" + doc: | + Percentage of ATAC doublets per dataset (intermediate filtered). + PDF format + + mid_fltr_vrlpdbl_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_vrlpdbl.png" + doc: | + Doublets overlap for RNA and ATAC assays per dataset (intermediate filtered). + PNG format + + mid_fltr_vrlpdbl_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_fltr_vrlpdbl.pdf" + doc: | + Doublets overlap for RNA and ATAC assays per dataset (intermediate filtered). + PDF format + mid_fltr_tss_nrch_plot_png: type: File? outputBinding: @@ -1288,6 +1445,54 @@ outputs: UMI per cell correlation for RNA vs ATAC assays (filtered). PDF format + fltr_rnadbl_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_rnadbl.png" + doc: | + Percentage of RNA doublets per dataset (filtered). + PNG format + + fltr_rnadbl_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_rnadbl.pdf" + doc: | + Percentage of RNA doublets per dataset (filtered). + PDF format + + fltr_atacdbl_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_atacdbl.png" + doc: | + Percentage of ATAC doublets per dataset (filtered). + PNG format + + fltr_atacdbl_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_atacdbl.pdf" + doc: | + Percentage of ATAC doublets per dataset (filtered). + PDF format + + fltr_vrlpdbl_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_vrlpdbl.png" + doc: | + Doublets overlap for RNA and ATAC assays per dataset (filtered). + PNG format + + fltr_vrlpdbl_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_vrlpdbl.pdf" + doc: | + Doublets overlap for RNA and ATAC assays per dataset (filtered). + PDF format + fltr_tss_atac_umi_corr_plot_png: type: File? 
outputBinding: @@ -1587,24 +1792,39 @@ doc: | s:about: | - usage: sc_multiome_filter.R - [-h] --mex MEX --identity IDENTITY --fragments FRAGMENTS --annotations - ANNOTATIONS [--grouping GROUPING] [--blacklist BLACKLIST] - [--barcodes BARCODES] [--rnamincells RNAMINCELLS] - [--mingenes [MINGENES ...]] [--maxgenes [MAXGENES ...]] - [--rnaminumi [RNAMINUMI ...]] [--mitopattern MITOPATTERN] - [--maxmt MAXMT] [--minnovelty [MINNOVELTY ...]] - [--atacmincells ATACMINCELLS] [--atacminumi [ATACMINUMI ...]] - [--maxnuclsignal [MAXNUCLSIGNAL ...]] - [--mintssenrich [MINTSSENRICH ...]] [--minfrip [MINFRIP ...]] - [--maxblacklist [MAXBLACKLIST ...]] [--callby CALLBY] [--pdf] - [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] - [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] - [--cpus CPUS] [--memory MEMORY] + usage: sc_multiome_filter.R [-h] --mex MEX --identity IDENTITY + --fragments FRAGMENTS --annotations + ANNOTATIONS --seqinfo SEQINFO + [--grouping GROUPING] + [--blacklist BLACKLIST] + [--barcodes BARCODES] + [--rnamincells RNAMINCELLS] + [--mingenes [MINGENES [MINGENES ...]]] + [--maxgenes [MAXGENES [MAXGENES ...]]] + [--rnaminumi [RNAMINUMI [RNAMINUMI ...]]] + [--mitopattern MITOPATTERN] + [--maxmt MAXMT] + [--minnovelty [MINNOVELTY [MINNOVELTY ...]]] + [--atacmincells ATACMINCELLS] + [--atacminumi [ATACMINUMI [ATACMINUMI ...]]] + [--maxnuclsignal [MAXNUCLSIGNAL [MAXNUCLSIGNAL ...]]] + [--mintssenrich [MINTSSENRICH [MINTSSENRICH ...]]] + [--minfrip [MINFRIP [MINFRIP ...]]] + [--maxblacklist [MAXBLACKLIST [MAXBLACKLIST ...]]] + [--callby CALLBY] + [--removedoublets {union,onlyrna,onlyatac,intersect}] + [--rnadbr RNADBR] + [--rnadbrsd RNADBRSD] + [--atacdbr ATACDBR] + [--atacdbrsd ATACDBRSD] [--pdf] + [--verbose] [--h5seurat] [--h5ad] + [--cbbuild] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] Single-cell Multiome ATAC and RNA-Seq Filtering Analysis - options: + optional 
arguments: -h, --help show this help message and exit --mex MEX Path to the folder with feature-barcode matrix from Cell Ranger ARC Count/Aggregate experiment in MEX @@ -1624,6 +1844,8 @@ s:about: | file is required. --annotations ANNOTATIONS Path to the genome annotation file in GTF format + --seqinfo SEQINFO Path to the headerless chromosome length file in TSV + format --grouping GROUPING Path to the TSV/CSV file to define datasets grouping. First column - 'library_id' with the values and order that correspond to the 'library_id' column from the ' @@ -1642,19 +1864,19 @@ s:about: | --rnamincells RNAMINCELLS Include only genes detected in at least this many cells. Default: 5 (applied to all datasets) - --mingenes [MINGENES ...] + --mingenes [MINGENES [MINGENES ...]] Include cells where at least this many genes are detected. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. Default: 250 (applied to all datasets) - --maxgenes [MAXGENES ...] + --maxgenes [MAXGENES [MAXGENES ...]] Include cells with the number of genes not bigger than this value. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. Default: 5000 (applied to all datasets) - --rnaminumi [RNAMINUMI ...] + --rnaminumi [RNAMINUMI [RNAMINUMI ...]] Include cells where at least this many UMI (RNA transcripts) are detected. If multiple values provided, each of them will be applied to the @@ -1667,7 +1889,7 @@ s:about: | --maxmt MAXMT Include cells with the percentage of transcripts mapped to mitochondrial genes not bigger than this value. Default: 5 (applied to all datasets) - --minnovelty [MINNOVELTY ...] + --minnovelty [MINNOVELTY [MINNOVELTY ...]] Include cells with the novelty score not lower than this value, calculated for as log10(genes)/log10(UMI) for RNA assay. 
If multiple values provided, each of @@ -1677,14 +1899,14 @@ s:about: | --atacmincells ATACMINCELLS Include only peaks detected in at least this many cells. Default: 5 (applied to all datasets) - --atacminumi [ATACMINUMI ...] + --atacminumi [ATACMINUMI [ATACMINUMI ...]] Include cells where at least this many UMI (ATAC transcripts) are detected. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. Default: 1000 (applied to all datasets) - --maxnuclsignal [MAXNUCLSIGNAL ...] + --maxnuclsignal [MAXNUCLSIGNAL [MAXNUCLSIGNAL ...]] Include cells with the nucleosome signal not bigger than this value. Nucleosome signal quantifies the approximate ratio of mononucleosomal to nucleosome- @@ -1692,7 +1914,7 @@ s:about: | them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. Default: 4 (applied to all datasets) - --mintssenrich [MINTSSENRICH ...] + --mintssenrich [MINTSSENRICH [MINTSSENRICH ...]] Include cells with the TSS enrichment score not lower than this value. Score is calculated based on the ratio of fragments centered at the TSS to fragments in @@ -1700,14 +1922,14 @@ s:about: | each of them will be applied to the correspondent dataset from the '--mex' input based on the '-- identity' file. Default: 2 (applied to all datasets) - --minfrip [MINFRIP ...] + --minfrip [MINFRIP [MINFRIP ...]] Include cells with the FRiP not lower than this value. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. FRiP is calculated for fragments. Default: 0.15 (applied to all datasets) - --maxblacklist [MAXBLACKLIST ...] + --maxblacklist [MAXBLACKLIST [MAXBLACKLIST ...]] Include cells with the fraction of fragments in genomic blacklist regions not bigger than this value. 
If multiple values provided, each of them will be @@ -1722,6 +1944,26 @@ s:about: | only after applying all RNA related thresholds, maximum nucleosome signal, and minimum TSS enrichment scores filters. Default: do not call peaks + --removedoublets {union,onlyrna,onlyatac,intersect} + Remove cells that were identified as doublets. For RNA + assay cells with UMI < 200 will not be evaluated. + Default: do not remove doublets + --rnadbr RNADBR Expected RNA doublet rate. Default: 1 percent per + thousand cells captured with 10x genomics + --rnadbrsd RNADBRSD Uncertainty range in the RNA doublet rate, interpreted + as a +/- around the value provided in --rnadbr. Set to + 0 to disable. Set to 1 to make the threshold depend + entirely on the misclassification rate. Default: 40 + percents of the value provided in --rnadbr + --atacdbr ATACDBR Expected ATAC doublet rate. Default: 1 percent per + thousand cells captured with 10x genomics + --atacdbrsd ATACDBRSD + Uncertainty range in the ATAC doublet rate, + interpreted as a +/- around the value provided in + --atacdbr. Set to 0 to disable. Set to 1 to make the + threshold depend entirely on the misclassification + rate. Default: 40 percents of the value provided in + --atacdbr --pdf Export plots in PDF. Default: false --verbose Print debug information. Default: false --h5seurat Save Seurat data to h5seurat file. Default: false diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index 2ca33041..b35a10f6 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.15 + dockerPull: biowardrobe2/sc-tools:v0.0.21 inputs: @@ -207,6 +207,14 @@ inputs: Save Seurat data to h5ad file. Default: false + export_scope_data: + type: boolean? + inputBinding: + prefix: "--scope" + doc: | + Save Seurat data to SCope compatible loom file. + Default: false + export_ucsc_cb: type: boolean? 
inputBinding: @@ -645,6 +653,13 @@ outputs: doc: | Reduced Seurat data in h5ad format + seurat_data_scope: + type: File? + outputBinding: + glob: "*_data.loom" + doc: | + Reduced Seurat data in SCope compatible loom format + stdout_log: type: stdout @@ -712,11 +727,12 @@ doc: | s:about: | usage: sc_rna_cluster.R - [-h] --query QUERY [--dimensions [DIMENSIONS ...]] + [-h] --query QUERY [--dimensions [DIMENSIONS [DIMENSIONS ...]]] [--ametric {euclidean,cosine,manhattan,hamming}] [--algorithm {louvain,mult-louvain,slm,leiden}] - [--resolution [RESOLUTION ...]] [--genes [GENES ...]] [--diffgenes] - [--logfc LOGFC] [--minpct MINPCT] [--onlypos] + [--resolution [RESOLUTION [RESOLUTION ...]]] + [--genes [GENES [GENES ...]]] [--diffgenes] [--logfc LOGFC] + [--minpct MINPCT] [--onlypos] [--testuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] [--pdf] [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] @@ -724,14 +740,14 @@ s:about: | Single-cell RNA-Seq Cluster Analysis - options: + optional arguments: -h, --help show this help message and exit --query QUERY Path to the RDS file to load Seurat object from. This file should include genes expression information stored in the RNA assay, as well as 'pca' and 'rnaumap' dimensionality reductions applied to that assay. - --dimensions [DIMENSIONS ...] + --dimensions [DIMENSIONS [DIMENSIONS ...]] Dimensionality to use when constructing nearest- neighbor graph before clustering (from 1 to 50). If single value N is provided, use from 1 to N @@ -743,14 +759,15 @@ s:about: | --algorithm {louvain,mult-louvain,slm,leiden} Algorithm for modularity optimization when running clustering. Default: louvain - --resolution [RESOLUTION ...] + --resolution [RESOLUTION [RESOLUTION ...]] Clustering resolution applied to the constructed nearest-neighbor graph. 
Can be set as an array but only the first item from the list will be used for cluster labels and gene markers in the UCSC Cell Browser when running with --cbbuild and --diffgenes parameters. Default: 0.3, 0.5, 1.0 - --genes [GENES ...] Genes of interest to build genes expression plots. + --genes [GENES [GENES ...]] + Genes of interest to build genes expression plots. Default: None --diffgenes Identify differentially expressed genes (putative gene markers) between each pair of clusters for all diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index 0eedf0a9..47e7f1de 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.15 + dockerPull: biowardrobe2/sc-tools:v0.0.21 inputs: @@ -534,16 +534,16 @@ doc: | s:about: | usage: sc_rna_da_cells.R [-h] --query QUERY [--reduction REDUCTION] - [--dimensions [DIMENSIONS ...]] [--knn [KNN ...]] [--metadata METADATA] - --splitby SPLITBY --first FIRST --second SECOND - [--resolution [RESOLUTION ...]] [--ranges RANGES RANGES] [--pdf] - [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] + [--dimensions [DIMENSIONS [DIMENSIONS ...]]] [--knn [KNN [KNN ...]]] + [--metadata METADATA] --splitby SPLITBY --first FIRST --second SECOND + [--resolution [RESOLUTION [RESOLUTION ...]]] [--ranges RANGES RANGES] + [--pdf] [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] Single-cell Differential Abundance Analysis - options: + optional arguments: -h, --help show this help message and exit --query QUERY Path to the RDS file to load Seurat object from. This file should include genes expression information @@ -554,12 +554,13 @@ s:about: | --reduction REDUCTION Dimensionality reduction to be used for DA analysis. Default: pca - --dimensions [DIMENSIONS ...] 
+ --dimensions [DIMENSIONS [DIMENSIONS ...]] Dimensionality to use when running DA analysis (from 1 to 50). If single value N is provided, use from 1 to N PCs. If multiple values are provided, subset to only selected PCs. Default: from 1 to 10 - --knn [KNN ...] Array of k values for kNN graph construction when + --knn [KNN [KNN ...]] + Array of k values for kNN graph construction when calculating the score vector for each cell to represent the DA behavior in the neighborhood. Default: calculated based on the cells number @@ -581,7 +582,7 @@ s:about: | --second SECOND Value from the Seurat object metadata column set with --splitby to define the second group of cells for DA analysis. - --resolution [RESOLUTION ...] + --resolution [RESOLUTION [RESOLUTION ...]] Clustering resolution applied to DA cells to identify DA cells populations. Can be set as an array. Default: 0.01, 0.03, 0.05 diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index 9ff80079..78efbe58 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.15 + dockerPull: biowardrobe2/sc-tools:v0.0.21 inputs: @@ -21,102 +21,152 @@ inputs: inputBinding: prefix: "--query" doc: | - Path to the RDS file to load Seurat object from. This file should include genes - expression information stored in the RNA assay. Additionally, 'rnaumap', and/or - 'atacumap', and/or 'wnnumap' dimensionality reductions should be present. + Path to the RDS file to load Seurat object from. This + file should include genes expression information + stored in the RNA assay. Additionally, 'rnaumap', + and/or 'atacumap', and/or 'wnnumap' dimensionality + reductions should be present. datasets_metadata: type: File? inputBinding: prefix: "--metadata" doc: | - Path to the TSV/CSV file to optionally extend Seurat object metadata with - categorical values using samples identities. 
First column - 'library_id' - should correspond to all unique values from the 'new.ident' column of the - loaded Seurat object. If any of the provided in this file columns are already - present in the Seurat object metadata, they will be overwritten. Default: no - extra metadata is added + Path to the TSV/CSV file to optionally extend Seurat + object metadata with categorical values using samples + identities. First column - 'library_id' should + correspond to all unique values from the 'new.ident' + column of the loaded Seurat object. If any of the + provided in this file columns are already present in + the Seurat object metadata, they will be overwritten. + When combined with --barcodes parameter, first the + metadata will be extended, then barcode filtering will + be applied. Default: no extra metadata is added + + barcodes_data: + type: File? + inputBinding: + prefix: "--barcodes" + doc: | + Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata ovewriting the existing ones if + those are present. Default: all cells used, no extra + metadata is added + + groupby: + type: string? + inputBinding: + prefix: "--groupby" + doc: | + Column from the Seurat object metadata to group cells + for optional subsetting when combined with --subset + parameter. May be one of the extra metadata columns + added with --metadata or --barcodes parameters. + Ignored if --subset is not set. Default: do not + subset, include all cells into analysis. + + subset: + type: + - "null" + - string + - string[] + inputBinding: + prefix: "--subset" + doc: | + Values from the column set with --groupby parameter to + subset cells before running differential expression + analysis. Ignored if --groupby is not provided. 
+ Default: do not subset cells, include all of them splitby: type: string inputBinding: prefix: "--splitby" doc: | - Column from the Seurat object metadata to split datasets into two groups - to run --second vs --first pseudobulk DE analysis, i.e., calculate log2FC. - May be one of the columns from the extra metadata added with --metadata - parameter. Provided value should group the datasets, not cells, therefore - do not use a column with clustering results. + Column from the Seurat object metadata to split cells + into two groups to run --second vs --first + differential expression analysis. May be one of the + extra metadata columns added with --metadata or + --barcodes parameters. first_cond: type: string inputBinding: prefix: "--first" doc: | - Value from the Seurat object metadata column set with --splitby to define the - first group of datasets for pseudobulk DE analysis. + Value from the Seurat object metadata column set with + --splitby parameter to define the first group of cells + for differential expression analysis. second_cond: type: string inputBinding: prefix: "--second" doc: | - Value from the Seurat object metadata column set with --splitby to define the - second group of datasets for pseudobulk DE analysis. + Value from the Seurat object metadata column set with + --splitby parameter to define the second group of + cells for differential expression analysis. - batchby: - type: string? - inputBinding: - prefix: "--batchby" - doc: | - Column from the Seurat object metadata to group datasets into batches. It will be used - as a factor variable to model batch effect when running pseudobulk DE analysis (makes - design formula look like ~splitby+batchby). May be one of the columns from the extra - metadata added with --metadata parameter. Provided value should batch the datasets, not - cells, therefore do not use a column with clustering results. Default: do not model - batch effect. - - groupby: - type: string? 
- inputBinding: - prefix: "--groupby" - doc: | - Column from the Seurat object metadata to group cells for optional subsetting - when combined with --subset parameter. May be one of the columns from the extra - metadata added with --metadata parameter. Ignored if --subset is not set. Provided - value defines the groups of cells, therefore any metadata column, including the - clustering results, may be used. Default: do not subset, run pseudobulk DE analysis - for all cells jointly - - subset: + analysis_method: type: - "null" - - string - - string[] + - type: enum + symbols: + - "wilcoxon" # (wilcox) Wilcoxon Rank Sum test + - "likelihood-ratio" # (bimod) Likelihood-ratio test + - "t-test" # (t) Student's t-test + - "negative-binomial" # (negbinom) Negative Binomial Generalized Linear Model (supports --batchby) + - "poisson" # (poisson) Poisson Generalized Linear Model (supports --batchby) + - "logistic-regression" # (LR) Logistic Regression (supports --batchby) + - "mast" # (MAST) MAST package (supports --batchby) + - "deseq" # DESeq2 Wald test on pseudobulk aggregated gene expression + - "deseq-lrt" # DESeq2 LRT test on pseudobulk aggregated gene expression inputBinding: - prefix: "--subset" - doc: | - Value(s) from the column set with --groupby parameter to subset cells - before running pseudobulk DE analysis. If multiple values are provided - run analysis jointly for selected groups of cells. Ignored if --groupby - is not set. Default: do not subset, run pseudobulk DE analysis for all - cells jointly + prefix: "--test" + doc: | + Test type to use in differential expression analysis. + If set to deseq or deseq-lrt, gene expression will be + aggregated to the pseudobulk level per dataset. For + deseq, the pair-wise Wald test will be used. For + deseq-lrt, the reduced formula will look like ~1 if + --batchby parameter is omitted or will be set to + ~batchby to exclude the criteria if interest (defined + by --splitby). 
For all other values of the --test + parameter the FindMarkers function will be used (genes + will be prefiltered by minimum percentage >= 0.1 and + by minimum log2FoldChange >= 0.25 before running + differential expression analysis). Default: use + FindMarkers with Wilcoxon Rank Sum test. - lrt: - type: boolean? + batchby: + type: string? inputBinding: - prefix: "--lrt" + prefix: "--batchby" doc: | - Use LRT instead of the pair-wise Wald test. If --batchby is not provided - use ~1 as a reduced formula, otherwise ~batchby. Default: use Wald test + Column from the Seurat object metadata to group cells + into batches. If --test is set to deseq or deseq-lrt + the --batchby parameter will be used in the design + formula in the following way ~splitby+batchby. If + --test is set to negative-binomial, poisson, logistic- + regression, or mast it will be used as a latent + variable in the FindMarkers function. Not supported + for --test values equal to wilcoxon, likelihood-ratio, + or t-test. May be one of the extra metadata columns + added with --metadata or --barcodes parameters. + Default: do not model batch effect. maximum_padj: type: float? inputBinding: prefix: "--padj" doc: | - In the exploratory visualization part of the analysis output only features - with adjusted P-value not bigger than this value. Default: 0.05 + In the exploratory visualization part of the analysis + output only differentially expressed genes with + adjusted P-value not bigger than this value. + Default: 0.05 genes_of_interest: type: @@ -126,42 +176,20 @@ inputs: inputBinding: prefix: "--genes" doc: | - Genes of interest to label on the generated plots. Default: top 10 genes - with the highest and the lowest log2FC expression values. + Genes of interest to label on the generated plots. + Default: top 10 genes with the highest and the lowest + log2FoldChange values. exclude_pattern: type: string? 
inputBinding: prefix: "--exclude" doc: | - Regex pattern to identify and exclude non-coding RNA genes from the pseudobulk - DE analysis (not case-sensitive). If any of such genes were provided in the --genes - parameter, they will be excluded from there as well. - Default: use all genes - - normalization_method: - type: - - "null" - - type: enum - symbols: - - "vst" - - "rlog" - inputBinding: - prefix: "--norm" - doc: | - Read counts normalization for the exploratory visualization part of the analysis. - Use 'vst' for medium-to-large datasets (n > 30) and 'rlog' for small datasets - (n < 30), when there is a wide range of sequencing depth across samples. - Default: rlog - - remove: - type: boolean? - inputBinding: - prefix: "--remove" - doc: | - Remove batch effect when generating normalized read counts for the exploratory - visualization part of the analysis. Ignored if --batchby is not provided. - Default: do not remove batch effect from normalized read counts. + Regex pattern to identify and exclude specific genes + from the differential expression analysis (not case- + sensitive). If any of such genes are provided in the + --genes parameter, they will be excluded from there as + well. Default: use all genes cluster_method: type: @@ -174,9 +202,11 @@ inputs: inputBinding: prefix: "--cluster" doc: | - Hopach clustering method to be run on normalized read counts for the - exploratory visualization part of the analysis. Default: do not run - clustering + Hopach clustering method to be run on the normalized + read counts for the exploratory visualization part of + the analysis. Clustering by column is supported only + when --test is set to deseq or deseq-lrt. Default: do + not run clustering row_distance: type: @@ -192,8 +222,9 @@ inputs: inputBinding: prefix: "--rowdist" doc: | - Distance metric for HOPACH row clustering. Ignored if --cluster is set - to column or not provided. Default: cosangle + Distance metric for HOPACH row clustering. 
Ignored if + --cluster is set to column or not provided. + Default: cosangle column_distance: type: @@ -209,17 +240,19 @@ inputs: inputBinding: prefix: "--columndist" doc: | - Distance metric for HOPACH column clustering. Ignored if --cluster is set - to row or not provided. Default: euclid + Distance metric for HOPACH column clustering. Ignored + if --cluster is set to row or not provided. + Default: euclid center_row: type: boolean? inputBinding: prefix: "--center" doc: | - Apply mean centering for gene expression prior to running - clustering by row. Ignored if --cluster is set to column or - not provided. Default: do not centered + Apply mean centering for gene expression prior to + running clustering by row. Ignored if --cluster is + set to column or not provided. Default: do not + center export_pdf_plots: type: boolean? @@ -245,8 +278,8 @@ inputs: inputBinding: prefix: "--theme" doc: | - Color theme for all generated plots. One of gray, bw, linedraw, light, - dark, minimal, classic, void. + Color theme for all generated plots. One of gray, bw, + linedraw, light, dark, minimal, classic, void. Default: classic verbose: @@ -270,8 +303,8 @@ inputs: inputBinding: prefix: "--memory" doc: | - Maximum memory in GB allowed to be shared between the workers - when using multiple --cpus. + Maximum memory in GB allowed to be shared between + the workers when using multiple --cpus. Default: 32 vector_memory_limit: @@ -297,9 +330,9 @@ outputs: outputBinding: glob: "*_umap_rd_rnaumap.png" doc: | - Cells UMAP split by selected biological condition, optionally - subsetted to the specific cluster or cell type (rnaumap dim. - reduction). + Cells UMAP split by selected criteria, + optionally subsetted to the specific + group (rnaumap dim. reduction). 
PNG format umap_rd_rnaumap_plot_pdf: @@ -307,9 +340,9 @@ outputs: outputBinding: glob: "*_umap_rd_rnaumap.pdf" doc: | - Cells UMAP split by selected biological condition, optionally - subsetted to the specific cluster or cell type (rnaumap dim. - reduction). + Cells UMAP split by selected criteria, + optionally subsetted to the specific + group (rnaumap dim. reduction). PDF format umap_rd_atacumap_plot_png: @@ -317,9 +350,9 @@ outputs: outputBinding: glob: "*_umap_rd_atacumap.png" doc: | - Cells UMAP split by selected biological condition, optionally - subsetted to the specific cluster or cell type (atacumap dim. - reduction). + Cells UMAP split by selected criteria, + optionally subsetted to the specific + group (atacumap dim. reduction). PNG format umap_rd_atacumap_plot_pdf: @@ -327,9 +360,9 @@ outputs: outputBinding: glob: "*_umap_rd_atacumap.pdf" doc: | - Cells UMAP split by selected biological condition, optionally - subsetted to the specific cluster or cell type (atacumap dim. - reduction). + Cells UMAP split by selected criteria, + optionally subsetted to the specific + group (atacumap dim. reduction). PDF format umap_rd_wnnumap_plot_png: @@ -337,9 +370,9 @@ outputs: outputBinding: glob: "*_umap_rd_wnnumap.png" doc: | - Cells UMAP split by selected biological condition, optionally - subsetted to the specific cluster or cell type (wnnumap dim. - reduction). + Cells UMAP split by selected criteria, + optionally subsetted to the specific + group (wnnumap dim. reduction). PNG format umap_rd_wnnumap_plot_pdf: @@ -347,9 +380,9 @@ outputs: outputBinding: glob: "*_umap_rd_wnnumap.pdf" doc: | - Cells UMAP split by selected biological condition, optionally - subsetted to the specific cluster or cell type (wnnumap dim. - reduction). + Cells UMAP split by selected criteria, + optionally subsetted to the specific + group (wnnumap dim. reduction). 
PDF format mds_plot_html: @@ -357,8 +390,8 @@ outputs: outputBinding: glob: "*_mds_plot.html" doc: | - MDS plot of normalized counts. Optionally batch corrected - if --remove was set to True. + MDS plot of pseudobulk aggregated + normalized reads counts. All genes. HTML format pca_1_2_plot_png: @@ -366,8 +399,8 @@ outputs: outputBinding: glob: "*_pca_1_2.png" doc: | - Normalized counts PCA (PC1 and PC2) subsetted to all DE genes regardless - of Padj, optionally batch corrected by the selected criteria. + Normalized reads counts PCA (1, 2). + All genes. PNG format pca_1_2_plot_pdf: @@ -375,8 +408,8 @@ outputs: outputBinding: glob: "*_pca_1_2.pdf" doc: | - Normalized counts PCA (PC1 and PC2) subsetted to all DE genes regardless - of Padj, optionally batch corrected by the selected criteria. + Normalized reads counts PCA (1, 2). + All genes. PDF format pca_2_3_plot_png: @@ -384,8 +417,8 @@ outputs: outputBinding: glob: "*_pca_2_3.png" doc: | - Normalized counts PCA (PC2 and PC3) subsetted to all DE genes regardless - of Padj, optionally batch corrected by the selected criteria. + Normalized reads counts PCA (2, 3). + All genes. PNG format pca_2_3_plot_pdf: @@ -393,8 +426,8 @@ outputs: outputBinding: glob: "*_pca_2_3.pdf" doc: | - Normalized counts PCA (PC2 and PC3) subsetted to all DE genes regardless - of Padj, optionally batch corrected by the selected criteria. + Normalized reads counts PCA (2, 3). + All genes. PDF format dxpr_vlcn_plot_png: @@ -402,11 +435,13 @@ outputs: outputBinding: glob: "*_dxpr_vlcn.png" doc: | - Volcano plot of differentially expressed genes. Highlighed genes are either - provided by user or top 10 genes with the highest log2FC values. The direction - of comparison is defined by --second vs --first groups of cells optionally - subsetted to the specific cluster or cell type and coerced to the pseudobulk - RNA-Seq samples. + Volcano plot of differentially expressed genes. 
+ Highlighed genes are either provided by user or + top 10 genes with the highest log2FoldChange + values. The direction of comparison is defined + as --second vs --first. Cells are optionally + subsetted to the specific group and optionally + coerced to the pseudobulk form. PNG format dxpr_vlcn_plot_pdf: @@ -414,35 +449,37 @@ outputs: outputBinding: glob: "*_dxpr_vlcn.pdf" doc: | - Volcano plot of differentially expressed genes. Highlighed genes are either - provided by user or top 10 genes with the highest log2FC values. The direction - of comparison is defined by --second vs --first groups of cells optionally - subsetted to the specific cluster or cell type and coerced to the pseudobulk - RNA-Seq samples. + Volcano plot of differentially expressed genes. + Highlighed genes are either provided by user or + top 10 genes with the highest log2FoldChange + values. The direction of comparison is defined + as --second vs --first. Cells are optionally + subsetted to the specific group and optionally + coerced to the pseudobulk form. PDF format xpr_dnst_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputBinding: - glob: "*_xpr_dnst_*.png" + glob: "*_xpr_dnst.png" doc: | - Log normalized gene expression density per dataset optionally subsetted to the - specific cluster or cell type. + Log normalized gene expression density plots for + either user provided or top 10 differentially + expressed genes with the highest log2FoldChange + values. The direction of comparison is defined + as --second vs --first. PNG format xpr_dnst_plot_pdf: - type: - - "null" - - type: array - items: File + type: File? outputBinding: - glob: "*_xpr_dnst_*.pdf" + glob: "*_xpr_dnst.pdf" doc: | - Log normalized gene expression density per dataset optionally subsetted to the - specific cluster or cell type. + Log normalized gene expression density plots for + either user provided or top 10 differentially + expressed genes with the highest log2FoldChange + values. 
The direction of comparison is defined + as --second vs --first. PDF format xpr_per_cell_rd_rnaumap_plot_png: @@ -453,8 +490,9 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_rnaumap_*.png" doc: | - Log normalized gene expression on cells UMAP per dataset optionally subsetted - to the specific cluster or cell type (rnaumap dim. reduction). + Log normalized gene expression on cells UMAP + split by selected criteria, optionally subsetted + to the specific group (rnaumap dim. reduction). PNG format xpr_per_cell_rd_rnaumap_plot_pdf: @@ -465,8 +503,9 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_rnaumap_*.pdf" doc: | - Log normalized gene expression on cells UMAP per dataset optionally subsetted - to the specific cluster or cell type (rnaumap dim. reduction). + Log normalized gene expression on cells UMAP + split by selected criteria, optionally subsetted + to the specific group (rnaumap dim. reduction). PDF format xpr_per_cell_rd_atacumap_plot_png: @@ -477,8 +516,9 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_atacumap_*.png" doc: | - Log normalized gene expression on cells UMAP per dataset optionally subsetted - to the specific cluster or cell type (atacumap dim. reduction). + Log normalized gene expression on cells UMAP + split by selected criteria, optionally subsetted + to the specific group (atacumap dim. reduction). PNG format xpr_per_cell_rd_atacumap_plot_pdf: @@ -489,8 +529,9 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_atacumap_*.pdf" doc: | - Log normalized gene expression on cells UMAP per dataset optionally subsetted - to the specific cluster or cell type (atacumap dim. reduction). + Log normalized gene expression on cells UMAP + split by selected criteria, optionally subsetted + to the specific group (atacumap dim. reduction). 
PDF format xpr_per_cell_rd_wnnumap_plot_png: @@ -501,8 +542,9 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_wnnumap_*.png" doc: | - Log normalized gene expression on cells UMAP per dataset optionally subsetted - to the specific cluster or cell type (wnnumap dim. reduction). + Log normalized gene expression on cells UMAP + split by selected criteria, optionally subsetted + to the specific group (wnnumap dim. reduction). PNG format xpr_per_cell_rd_wnnumap_plot_pdf: @@ -513,8 +555,9 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_wnnumap_*.pdf" doc: | - Log normalized gene expression on cells UMAP per dataset optionally subsetted - to the specific cluster or cell type (wnnumap dim. reduction). + Log normalized gene expression on cells UMAP + split by selected criteria, optionally subsetted + to the specific group (wnnumap dim. reduction). PDF format xpr_htmp_plot_png: @@ -522,8 +565,9 @@ outputs: outputBinding: glob: "*_xpr_htmp.png" doc: | - Normalized gene expression heatmap optionally subsetted - to the specific cluster or cell type. + Filtered by adjusted P-value normalized gene + expression heatmap per cell optionally subsetted + to the specific group. PNG format xpr_htmp_plot_pdf: @@ -531,8 +575,9 @@ outputs: outputBinding: glob: "*_xpr_htmp.pdf" doc: | - Normalized gene expression heatmap optionally subsetted - to the specific cluster or cell type. + Filtered by adjusted P-value normalized gene + expression heatmap per cell optionally subsetted + to the specific group. PDF format diff_expr_genes: @@ -540,26 +585,36 @@ outputs: outputBinding: glob: "*_de_genes.tsv" doc: | - Differentially expressed genes. + Differentially expressed genes. Not filtered + by adjusted P-value. TSV format - read_counts_gct: + bulk_read_counts_gct: type: File? outputBinding: - glob: "*_norm_read_counts.gct" + glob: "*_bulk_counts.gct" doc: | - GSEA compatible normalized counts, optionally, batch corrected. 
+ GSEA compatible not filtered normalized reads + counts aggregated to pseudobulk form. GCT format - phenotypes_cls: + bulk_phenotypes_cls: type: File? outputBinding: - glob: "*_phenotypes.cls" + glob: "*_bulk_phntps.cls" doc: | - GSEA compatible phenotypes file defined based on --splitby, --first, - and --second parameters. + GSEA compatible phenotypes file defined based + on --splitby, --first, and --second parameters. CLS format + cell_read_counts_gct: + type: File? + outputBinding: + glob: "*_cell_counts.gct" + doc: | + Filtered normalized reads counts per cell. + GCT format + stdout_log: type: stdout @@ -580,9 +635,9 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell Pseudobulk Differential Expression Analysis Between Datasets" -s:name: "Single-cell Pseudobulk Differential Expression Analysis Between Datasets" -s:alternateName: "Identifies differentially expressed genes between groups of cells coerced to pseudobulk datasets" +label: "Single-cell Differential Expression Analysis" +s:name: "Single-cell Differential Expression Analysis" +s:alternateName: "Identifies differentially expressed genes between two groups of cells optionally coerced to pseudobulk form" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-rna-de-pseudobulk.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -620,26 +675,27 @@ s:creator: doc: | - Single-cell Pseudobulk Differential Expression Analysis Between Datasets + Single-cell Differential Expression Analysis - Identifies differentially expressed genes between groups - of cells coerced to pseudobulk datasets. 
+ Identifies differentially expressed genes between two + groups of cells optionally coerced to pseudobulk form s:about: | usage: sc_rna_de_pseudobulk.R - [-h] --query QUERY [--metadata METADATA] --splitby SPLITBY --first - FIRST --second SECOND [--batchby BATCHBY] [--groupby GROUPBY] - [--subset [SUBSET ...]] [--lrt] [--padj PADJ] [--genes [GENES ...]] - [--exclude EXCLUDE] [--norm {vst,rlog}] [--remove] - [--cluster {row,column,both}] + [-h] --query QUERY [--metadata METADATA] [--barcodes BARCODES] + [--groupby GROUPBY] [--subset [SUBSET ...]] --splitby SPLITBY --first + FIRST --second SECOND + [--test {wilcoxon,likelihood-ratio,t-test,negative-binomial,poisson,logistic-regression,mast,deseq,deseq-lrt}] + [--batchby BATCHBY] [--padj PADJ] [--genes [GENES ...]] + [--exclude EXCLUDE] [--cluster {row,column,both}] [--rowdist {cosangle,abscosangle,euclid,abseuclid,cor,abscor}] [--columndist {cosangle,abscosangle,euclid,abseuclid,cor,abscor}] [--center] [--pdf] [--verbose] [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] - Single-cell Pseudobulk Differential Expression Analysis Between Datasets + Single-cell Differential Expression Analysis options: -h, --help show this help message and exit @@ -655,73 +711,81 @@ s:about: | column of the loaded Seurat object. If any of the provided in this file columns are already present in the Seurat object metadata, they will be overwritten. - Default: no extra metadata is added - --splitby SPLITBY Column from the Seurat object metadata to split - datasets into two groups to run --second vs --first - pseudobulk DE analysis, i.e., calculate log2FC. May be - one of the columns from the extra metadata added with - --metadata parameter. Provided value should group the - datasets, not cells, therefore do not use a column - with clustering results. 
- --first FIRST Value from the Seurat object metadata column set with - --splitby to define the first group of datasets for - pseudobulk DE analysis. - --second SECOND Value from the Seurat object metadata column set with - --splitby to define the second group of datasets for - pseudobulk DE analysis. - --batchby BATCHBY Column from the Seurat object metadata to group - datasets into batches. It will be used as a factor - variable to model batch effect when running pseudobulk - DE analysis (makes design formula look like - ~splitby+batchby). May be one of the columns from the - extra metadata added with --metadata parameter. - Provided value should batch the datasets, not cells, - therefore do not use a column with clustering results. - Default: do not model batch effect. + When combined with --barcodes parameter, first the + metadata will be extended, then barcode filtering will + be applied. Default: no extra metadata is added + --barcodes BARCODES Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata ovewriting the existing ones if + those are present. Default: all cells used, no extra + metadata is added --groupby GROUPBY Column from the Seurat object metadata to group cells for optional subsetting when combined with --subset - parameter. May be one of the columns from the extra - metadata added with --metadata parameter. Ignored if - --subset is not set. Provided value defines the groups - of cells, therefore any metadata column, including the - clustering results, may be used. Default: do not - subset, run pseudobulk DE analysis for all cells - jointly + parameter. May be one of the extra metadata columns + added with --metadata or --barcodes parameters. + Ignored if --subset is not set. Default: do not + subset, include all cells into analysis. --subset [SUBSET ...] 
- Value(s) from the column set with --groupby parameter - to subset cells before running pseudobulk DE analysis. - If multiple values are provided run analysis jointly - for selected groups of cells. Ignored if --groupby is - not set. Default: do not subset, run pseudobulk DE - analysis for all cells jointly - --lrt Use LRT instead of the pair-wise Wald test. If - --batchby is not provided use ~1 as a reduced formula, - otherwise ~batchby. Default: use Wald test + Values from the column set with --groupby parameter to + subset cells before running differential expression + analysis. Ignored if --groupby is not provided. + Default: do not subset cells, include all of them. + --splitby SPLITBY Column from the Seurat object metadata to split cells + into two groups to run --second vs --first + differential expression analysis. May be one of the + extra metadata columns added with --metadata or + --barcodes parameters. + --first FIRST Value from the Seurat object metadata column set with + --splitby parameter to define the first group of cells + for differential expression analysis. + --second SECOND Value from the Seurat object metadata column set with + --splitby parameter to define the second group of + cells for differential expression analysis. + --test {wilcoxon,likelihood-ratio,t-test,negative-binomial,poisson,logistic-regression,mast,deseq,deseq-lrt} + Test type to use in differential expression analysis. + If set to deseq or deseq-lrt, gene expression will be + aggregated to the pseudobulk level per dataset. For + deseq, the pair-wise Wald test will be used. For + deseq-lrt, the reduced formula will look like ~1 if + --batchby parameter is omitted or will be set to + ~batchby to exclude the criteria if interest (defined + by --splitby). 
For all other values of the --test + parameter the FindMarkers function will be used (genes + will be prefiltered by minimum percentage >= 0.1 and + by minimum log2FoldChange >= 0.25 before running + differential expression analysis). Default: use + FindMarkers with Wilcoxon Rank Sum test. + --batchby BATCHBY Column from the Seurat object metadata to group cells + into batches. If --test is set to deseq or deseq-lrt + the --batchby parameter will be used in the design + formula in the following way ~splitby+batchby. If + --test is set to negative-binomial, poisson, logistic- + regression, or mast it will be used as a latent + variable in the FindMarkers function. Not supported + for --test values equal to wilcoxon, likelihood-ratio, + or t-test. May be one of the extra metadata columns + added with --metadata or --barcodes parameters. + Default: do not model batch effect. --padj PADJ In the exploratory visualization part of the analysis - output only features with adjusted P-value not bigger - than this value. Default: 0.05 + output only differentially expressed genes with + adjusted P-value not bigger than this value. Default: + 0.05 --genes [GENES ...] Genes of interest to label on the generated plots. Default: top 10 genes with the highest and the lowest - log2FC expression values. - --exclude EXCLUDE Regex pattern to identify and exclude non-coding RNA - genes from the pseudobulk DE analysis (not case- - sensitive). If any of such genes were provided in the + log2FoldChange values. + --exclude EXCLUDE Regex pattern to identify and exclude specific genes + from the differential expression analysis (not case- + sensitive). If any of such genes are provided in the --genes parameter, they will be excluded from there as well. Default: use all genes - --norm {vst,rlog} Read counts normalization for the exploratory - visualization part of the analysis. 
Use 'vst' for - medium-to-large datasets (n > 30) and 'rlog' for small - datasets (n < 30), when there is a wide range of - sequencing depth across samples. Default: rlog - --remove Remove batch effect when generating normalized read - counts for the exploratory visualization part of the - analysis. Ignored if --batchby is not provided. - Default: do not remove batch effect from normalized - read counts. --cluster {row,column,both} - Hopach clustering method to be run on normalized read - counts for the exploratory visualization part of the - analysis. Default: do not run clustering + Hopach clustering method to be run on the normalized + read counts for the exploratory visualization part of + the analysis. Clustering by column is supported only + when --test is set to deseq or deseq-lrt. Default: do + not run clustering --rowdist {cosangle,abscosangle,euclid,abseuclid,cor,abscor} Distance metric for HOPACH row clustering. Ignored if --cluster is set to column or not provided. Default: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index 7f0d0281..7d62dafc 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.15 + dockerPull: biowardrobe2/sc-tools:v0.0.21 inputs: @@ -139,6 +139,34 @@ inputs: genes not bigger than this value. Default: 5 (applied to all datasets) + remove_doublets: + type: boolean? + inputBinding: + prefix: "--removedoublets" + doc: | + Remove cells that were identified as doublets. Cells with + RNA UMI < 200 will not be evaluated. Default: do not remove + doublets + + rna_doublet_rate: + type: float? + inputBinding: + prefix: "--rnadbr" + doc: | + Expected RNA doublet rate. Default: 1 percent per + thousand cells captured with 10x genomics + + rna_doublet_rate_sd: + type: float? 
+ inputBinding: + prefix: "--rnadbrsd" + doc: | + Uncertainty range in the RNA doublet rate, interpreted as + a +/- around the value provided in --rnadbr. Set to 0 to + disable. Set to 1 to make the threshold depend entirely + on the misclassification rate. Default: 40 percents of the + value provided in --rnadbr + export_pdf_plots: type: boolean? inputBinding: @@ -377,6 +405,22 @@ outputs: QC metrics per cell density (not filtered). PDF format + raw_rnadbl_plot_png: + type: File? + outputBinding: + glob: "*_raw_rnadbl.png" + doc: | + Percentage of RNA doublets per dataset (not filtered). + PNG format + + raw_rnadbl_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_rnadbl.pdf" + doc: | + Percentage of RNA doublets per dataset (not filtered). + PDF format + raw_umi_dnst_spl_cnd_plot_png: type: File? outputBinding: @@ -587,6 +631,22 @@ outputs: QC metrics per cell density (filtered). PDF format + fltr_rnadbl_plot_png: + type: File? + outputBinding: + glob: "*_fltr_rnadbl.png" + doc: | + Percentage of RNA doublets per dataset (filtered). + PNG format + + fltr_rnadbl_plot_pdf: + type: File? + outputBinding: + glob: "*_fltr_rnadbl.pdf" + doc: | + Percentage of RNA doublets per dataset (filtered). + PDF format + fltr_umi_dnst_spl_cnd_plot_png: type: File? outputBinding: @@ -775,16 +835,19 @@ s:about: | usage: sc_rna_filter.R [-h] --mex MEX [MEX ...] 
--identity IDENTITY [--grouping GROUPING] [--barcodes BARCODES] [--rnamincells RNAMINCELLS] - [--mingenes [MINGENES ...]] [--maxgenes [MAXGENES ...]] - [--rnaminumi [RNAMINUMI ...]] [--minnovelty [MINNOVELTY ...]] - [--mitopattern MITOPATTERN] [--maxmt MAXMT] [--pdf] [--verbose] + [--mingenes [MINGENES [MINGENES ...]]] + [--maxgenes [MAXGENES [MAXGENES ...]]] + [--rnaminumi [RNAMINUMI [RNAMINUMI ...]]] + [--minnovelty [MINNOVELTY [MINNOVELTY ...]]] + [--mitopattern MITOPATTERN] [--maxmt MAXMT] [--removedoublets] + [--rnadbr RNADBR] [--rnadbrsd RNADBRSD] [--pdf] [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] Single-cell RNA-Seq Filtering Analysis - options: + optional arguments: -h, --help show this help message and exit --mex MEX [MEX ...] Path to the folder with feature-barcode matrix from Cell Ranger Count/Aggregate experiment in MEX format. @@ -819,26 +882,26 @@ s:about: | cells. Ignored when '--mex' points to the feature- barcode matrices from the multiple Cell Ranger Count experiments. Default: 5 (applied to all datasets) - --mingenes [MINGENES ...] + --mingenes [MINGENES [MINGENES ...]] Include cells where at least this many genes are detected. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. Default: 250 (applied to all datasets) - --maxgenes [MAXGENES ...] + --maxgenes [MAXGENES [MAXGENES ...]] Include cells with the number of genes not bigger than this value. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. Default: 5000 (applied to all datasets) - --rnaminumi [RNAMINUMI ...] + --rnaminumi [RNAMINUMI [RNAMINUMI ...]] Include cells where at least this many UMI (transcripts) are detected. 
If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. Default: 500 (applied to all datasets) - --minnovelty [MINNOVELTY ...] + --minnovelty [MINNOVELTY [MINNOVELTY ...]] Include cells with the novelty score not lower than this value, calculated for as log10(genes)/log10(UMI). If multiple values provided, each of them will be @@ -851,6 +914,16 @@ s:about: | --maxmt MAXMT Include cells with the percentage of transcripts mapped to mitochondrial genes not bigger than this value. Default: 5 (applied to all datasets) + --removedoublets Remove cells that were identified as doublets. Cells + with RNA UMI < 200 will not be evaluated. Default: do + not remove doublets + --rnadbr RNADBR Expected RNA doublet rate. Default: 1 percent per + thousand cells captured with 10x genomics + --rnadbrsd RNADBRSD Uncertainty range in the RNA doublet rate, interpreted + as a +/- around the value provided in --rnadbr. Set to + 0 to disable. Set to 1 to make the threshold depend + entirely on the misclassification rate. Default: 40 + percents of the value provided in --rnadbr --pdf Export plots in PDF. Default: false --verbose Print debug information. Default: false --h5seurat Save Seurat data to h5seurat file. Default: false diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index 573c900b..33124f42 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.15 + dockerPull: biowardrobe2/sc-tools:v0.0.21 inputs: @@ -165,11 +165,13 @@ inputs: inputBinding: prefix: "--dimensions" doc: | - Dimensionality to use in UMAP projection (from 1 to 50). If single value N - is provided, use from 1 to N PCs. If multiple values are provided, subset to - only selected PCs. In combination with --ntgr set to harmony, selected principle - components will be used in Harmony integration. 
- Default: from 1 to 10 + Dimensionality to use in UMAP projection (from 1 to + 50). If single value N is provided, use from 1 to N + PCs. If multiple values are provided, subset to only + specified PCs. In combination with --ntgr set to + harmony, multiple values will result in using all + principal components starting from 1 to the max of the + provided values. Default: from 1 to 10 umap_spread: type: float? @@ -301,6 +303,14 @@ inputs: Save Seurat data to h5ad file. Default: false + export_scope_data: + type: boolean? + inputBinding: + prefix: "--scope" + doc: | + Save Seurat data to SCope compatible loom file. + Default: false + export_ucsc_cb: type: boolean? inputBinding: @@ -668,6 +678,13 @@ outputs: doc: | Reduced Seurat data in h5ad format + seurat_data_scope: + type: File? + outputBinding: + glob: "*_data.loom" + doc: | + Reduced Seurat data in SCope compatible loom format + stdout_log: type: stdout @@ -737,20 +754,22 @@ s:about: | usage: sc_rna_reduce.R [-h] --query QUERY [--metadata METADATA] [--barcodes BARCODES] [--cellcycle CELLCYCLE] [--norm {sct,log,sctglm}] - [--ntgr {seurat,harmony,none}] [--ntgrby [NTGRBY ...]] + [--ntgr {seurat,harmony,none}] [--ntgrby [NTGRBY [NTGRBY ...]]] [--highvargenes HIGHVARGENES] [--regressmt] - [--regressgenes [REGRESSGENES ...]] [--regressccfull | --regressccdiff] - [--dimensions [DIMENSIONS ...]] [--uspread USPREAD] + [--regressgenes [REGRESSGENES [REGRESSGENES ...]]] + [--regressccfull | --regressccdiff] + [--dimensions [DIMENSIONS [DIMENSIONS ...]]] [--uspread USPREAD] [--umindist UMINDIST] [--uneighbors UNEIGHBORS] [--umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis,mahalanobis,wminkowski,seuclidean,cosine,correlation,haversine,hamming,jaccard,dice,russelrao,kulsinski,ll_dirichlet,hellinger,rogerstanimoto,sokalmichener,sokalsneath,yule}] [--umethod {uwot,uwot-learn,umap-learn}] [--pdf] [--verbose] - [--h5seurat] [--h5ad] [--cbbuild] [--lowmem] [--output OUTPUT] + [--h5seurat] [--h5ad] [--scope] 
[--cbbuild] [--lowmem] + [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] Single-cell RNA-Seq Dimensionality Reduction Analysis - options: + optional arguments: -h, --help show this help message and exit --query QUERY Path to the RDS file to load Seurat object from. This file should include genes expression information @@ -790,7 +809,7 @@ s:about: | Integration method used for joint analysis of multiple datasets. Automatically set to 'none' if loaded Seurat object includes only one dataset. Default: seurat - --ntgrby [NTGRBY ...] + --ntgrby [NTGRBY [NTGRBY ...]] Column(s) from the Seurat object metadata to define the variable(s) that should be integrated out when running multiple datasets integration with harmony. @@ -804,7 +823,7 @@ s:about: | --regressmt Regress the percentage of transcripts mapped to mitochondrial genes as a confounding source of variation. Default: false - --regressgenes [REGRESSGENES ...] + --regressgenes [REGRESSGENES [REGRESSGENES ...]] Genes which expression should be regressed as a confounding source of variation. Default: None --regressccfull Regress all signals associated with cell cycle phase. @@ -816,13 +835,14 @@ s:about: | and cycling cells will be maintained. Ignored if --cellcycle is not provided. Mutually exclusive with --regressccfull Default: false - --dimensions [DIMENSIONS ...] + --dimensions [DIMENSIONS [DIMENSIONS ...]] Dimensionality to use in UMAP projection (from 1 to 50). If single value N is provided, use from 1 to N PCs. If multiple values are provided, subset to only - selected PCs. In combination with --ntgr set to - harmony, selected principle components will be used in - Harmony integration. Default: from 1 to 10 + specified PCs. In combination with --ntgr set to + harmony, multiple values will result in using all + principal components starting from 1 to the max of the + provided values. 
Default: from 1 to 10 --uspread USPREAD The effective scale of embedded points on UMAP. In combination with '--mindist' it determines how clustered/clumped the embedded points are. Default: 1 @@ -848,6 +868,8 @@ s:about: | --verbose Print debug information. Default: false --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save Seurat data to h5ad file. Default: false + --scope Save Seurat data to SCope compatible loom file. + Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --lowmem Attempts to minimize RAM usage when integrating multiple datasets with SCTransform algorithm (slows diff --git a/tools/sc-split-atac.cwl b/tools/sc-split-atac.cwl new file mode 100644 index 00000000..73145a8c --- /dev/null +++ b/tools/sc-split-atac.cwl @@ -0,0 +1,147 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-split-atac:v0.0.1 + + +inputs: + + atac_fragments_file: + type: File + inputBinding: + prefix: "--fragments" + doc: | + Path to GZIP compressed TSV file with ATAC fragments (from Cell Ranger ARC) + + clusters_metadata: + type: File + inputBinding: + prefix: "--clusters" + doc: | + Path to headerless TSV file with barcodes (first column) and + clusters (second column) + + log_level: + type: + - "null" + - type: enum + symbols: + - "fatal" + - "error" + - "warning" + - "info" + - "debug" + inputBinding: + prefix: "--loglevel" + doc: | + Logging level. + Default: info + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output file prefix. + Default: ./split + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of processes to run in parallel. 
+ Default: 1 + + +outputs: + + atac_fragments_per_cluster_file: + type: File[] + outputBinding: + glob: "*.bed" + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["sc_split_atac.py"] + +stdout: sc_split_atac_stdout.log +stderr: sc_split_atac_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-cell Split ATAC Fragments" +s:name: "Single-cell Split ATAC Fragments" +s:alternateName: "Splits scATAC fragments produced by Cell Ranger ARC Count/Aggregate pipelines" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-split-atac.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell Split ATAC Fragments + ============================================================================= + Splits scATAC fragments produced by Cell Ranger ARC Count/Aggregate pipelines + + +s:about: | + usage: sc_split_atac.py [-h] --fragments FRAGMENTS --clusters CLUSTERS 
[--cpus CPUS] [--loglevel {fatal,error,warning,info,debug}] [--output OUTPUT] + + optional arguments: + -h, --help show this help message and exit + --fragments FRAGMENTS + Path to GZIP compressed TSV file with ATAC fragments (from Cell Ranger ARC) + --clusters CLUSTERS Path to headerless TSV file with barcodes (first column) and clusters (second column) + --cpus CPUS Number of processes to run in parallel + --loglevel {fatal,error,warning,info,debug} + Logging level. Default: info + --output OUTPUT Output file prefix diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index bb0172d6..61fb9723 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.15 + dockerPull: biowardrobe2/sc-tools:v0.0.21 inputs: @@ -409,7 +409,7 @@ s:about: | Single-cell Label Integration Analysis - options: + optional arguments: -h, --help show this help message and exit --query QUERY Path to the RDS file to load Seurat object from. This file should include genes expression and/or chromatin diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index d564ee83..861bd510 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.15 + dockerPull: biowardrobe2/sc-tools:v0.0.21 inputs: @@ -346,6 +346,15 @@ inputs: Save Seurat data to h5ad file. Default: false + export_scope_data: + type: boolean? + inputBinding: + prefix: "--scope" + doc: | + Save Seurat data to SCope compatible loom file. + Only not normalized raw counts from the RNA assay + will be saved. Default: false + export_ucsc_cb: type: boolean? inputBinding: @@ -792,6 +801,13 @@ outputs: doc: | Reduced Seurat data in h5ad format + seurat_data_scope: + type: File? 
+ outputBinding: + glob: "*_data.loom" + doc: | + Reduced Seurat data in SCope compatible loom format + stdout_log: type: stdout @@ -860,15 +876,16 @@ doc: | s:about: | usage: sc_wnn_cluster.R - [-h] --query QUERY [--rnadimensions [RNADIMENSIONS ...]] - [--atacdimensions [ATACDIMENSIONS ...]] + [-h] --query QUERY + [--rnadimensions [RNADIMENSIONS [RNADIMENSIONS ...]]] + [--atacdimensions [ATACDIMENSIONS [ATACDIMENSIONS ...]]] [--algorithm {louvain,mult-louvain,slm,leiden}] [--uspread USPREAD] [--umindist UMINDIST] [--uneighbors UNEIGHBORS] [--umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis,mahalanobis,wminkowski,seuclidean,cosine,correlation,haversine,hamming,jaccard,dice,russelrao,kulsinski,ll_dirichlet,hellinger,rogerstanimoto,sokalmichener,sokalsneath,yule}] [--umethod {uwot,uwot-learn,umap-learn}] - [--resolution [RESOLUTION ...]] [--fragments FRAGMENTS] - [--genes [GENES ...]] [--diffgenes] [--diffpeaks] [--rnalogfc RNALOGFC] - [--rnaminpct RNAMINPCT] [--rnaonlypos] + [--resolution [RESOLUTION [RESOLUTION ...]]] [--fragments FRAGMENTS] + [--genes [GENES [GENES ...]]] [--diffgenes] [--diffpeaks] + [--rnalogfc RNALOGFC] [--rnaminpct RNAMINPCT] [--rnaonlypos] [--rnatestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] [--ataclogfc ATACLOGFC] [--atacminpct ATACMINPCT] [--atactestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] @@ -878,7 +895,7 @@ s:about: | Single-cell WNN Cluster Analysis - options: + optional arguments: -h, --help show this help message and exit --query QUERY Path to the RDS file to load Seurat object from. This file should include genes expression and chromatin @@ -886,14 +903,14 @@ s:about: | assays correspondingly. Additionally, 'pca', 'rnaumap', 'atac_lsi' and 'atacumap' dimensionality reductions should be present. - --rnadimensions [RNADIMENSIONS ...] 
+ --rnadimensions [RNADIMENSIONS [RNADIMENSIONS ...]] Dimensionality from the 'pca' reduction to use when constructing weighted nearest-neighbor graph before clustering (from 1 to 50). If single value N is provided, use from 1 to N dimensions. If multiple values are provided, subset to only selected dimensions. Default: from 1 to 10 - --atacdimensions [ATACDIMENSIONS ...] + --atacdimensions [ATACDIMENSIONS [ATACDIMENSIONS ...]] Dimensionality from the 'atac_lsi' reduction to use when constructing weighted nearest-neighbor graph before clustering (from 1 to 50). If single value N is @@ -924,7 +941,7 @@ s:about: | --umethod {uwot,uwot-learn,umap-learn} UMAP implementation to run. If set to 'umap-learn' use --umetric 'correlation' Default: uwot - --resolution [RESOLUTION ...] + --resolution [RESOLUTION [RESOLUTION ...]] Clustering resolution applied to the constructed weighted nearest-neighbor graph. Can be set as an array but only the first item from the list will be @@ -936,7 +953,8 @@ s:about: | Count and barcode information for every ATAC fragment used in the loaded Seurat object. File should be saved in TSV format with tbi-index file. - --genes [GENES ...] Genes of interest to build gene expression and Tn5 + --genes [GENES [GENES ...]] + Genes of interest to build gene expression and Tn5 insertion frequency plots for the nearest peaks. If ' --fragments' is not provided only gene expression plots will be built. 
Default: None diff --git a/workflows/cellranger-atac-aggr.cwl b/workflows/cellranger-atac-aggr.cwl new file mode 100644 index 00000000..80aaa8a5 --- /dev/null +++ b/workflows/cellranger-atac-aggr.cwl @@ -0,0 +1,362 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: +- class: SubworkflowFeatureRequirement +- class: StepInputExpressionRequirement +- class: InlineJavascriptRequirement +- class: MultipleInputFeatureRequirement + + +'sd:upstream': + sc_atacseq_sample: + - "cellranger-atac-count.cwl" + genome_indices: + - "cellranger-mkref.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name/Alias" + sd:preview: + position: 1 + + gem_well_labels: + type: string[] + label: "Cell Ranger ATAC Count Experiment" + doc: | + Array of GEM well identifiers to be used for labeling purposes only. + If not provided use rootnames of files from the barcode_metrics_report + input + 'sd:upstreamSource': "sc_atacseq_sample/alias" + 'sd:localLabel': true + + fragments_file_from_count: + type: File[] + secondaryFiles: + - .tbi + label: "Cell Ranger ATAC Count Experiment" + doc: | + Array of files containing count and barcode information for + every ATAC fragment observed in the "cellranger-atac count" + experiment in TSV format. 
+ 'sd:upstreamSource': "sc_atacseq_sample/atac_fragments_file" + + barcode_metrics_report_from_count: + type: File[] + label: "Cell Ranger ATAC Count Experiment" + doc: | + Array of files with per-barcode fragment counts & metrics + produced by "cellranger-atac count" command in CSV format + 'sd:upstreamSource': "sc_atacseq_sample/barcode_metrics_report" + + indices_folder: + type: Directory + label: "Genome Type" + doc: "Cell Ranger ARC generated genome indices folder" + 'sd:upstreamSource': "genome_indices/arc_indices_folder" + 'sd:localLabel': true + + normalization_mode: + type: + - "null" + - type: enum + symbols: ["none", "depth"] + default: "none" + label: "Library depth normalization mode" + doc: "Library depth normalization mode" + 'sd:layout': + advanced: true + + threads: + type: int? + default: 4 + label: "Number of threads" + doc: "Number of threads for those steps that support multithreading" + 'sd:layout': + advanced: true + + memory_limit: + type: int? + default: 20 + label: "Maximum memory used (GB)" + doc: "Maximum memory used (GB). 
The same will be applied to virtual memory" + 'sd:layout': + advanced: true + + +outputs: + + web_summary_report: + type: File + outputSource: aggregate_counts/web_summary_report + label: "Run summary metrics and charts in HTML format" + doc: | + Run summary metrics and charts in HTML format + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + metrics_summary_report_json: + type: File + outputSource: aggregate_counts/metrics_summary_report_json + label: "Run summary metrics in JSON format" + doc: | + Run summary metrics in JSON format + + metrics_summary_report_csv: + type: File + outputSource: aggregate_counts/metrics_summary_report_csv + label: "Run summary metrics in CSV format" + doc: | + Run summary metrics in CSV format + + barcode_metrics_report: + type: File + outputSource: aggregate_counts/barcode_metrics_report + label: "Per-barcode fragment counts & metrics in CSV format" + doc: | + Per-barcode fragment counts & metrics in CSV format + + atac_fragments_file: + type: File + outputSource: aggregate_counts/fragments_file + label: "Count and barcode information for every ATAC fragment" + doc: | + Count and barcode information for every ATAC fragment observed + in the aggregated experiment in TSV format + + peaks_bed_file: + type: File + outputSource: aggregate_counts/peaks_bed_file + label: "Locations of open-chromatin regions identified in the aggregated experiment" + doc: | + Locations of open-chromatin regions identified in the + aggregated experiment (these regions are referred to + as "peaks") + + peak_annotation_file: + type: File + outputSource: aggregate_counts/peak_annotation_file + label: "Annotations of peaks based on genomic proximity alone" + doc: | + Annotations of peaks based on genomic proximity alone + + secondary_analysis_report_folder: + type: File + outputSource: compress_secondary_analysis_report_folder/compressed_folder + label: "Compressed folder with secondary analysis results" + doc: | + Folder with secondary analysis 
results + + filtered_feature_bc_matrix_folder: + type: File + outputSource: compress_filtered_feature_bc_matrix_folder/compressed_folder + label: "Compressed folder with aggregated filtered peak-barcode matrices" + doc: | + Folder with aggregated filtered peak-barcode matrices + containing only cellular barcodes in MEX format. + + filtered_feature_bc_matrix_h5: + type: File + outputSource: aggregate_counts/filtered_feature_bc_matrix_h5 + label: "Aggregated filtered peak-barcode matrices in HDF5 format" + doc: | + Aggregated filtered peak-barcode matrices containing + only cellular barcodes in HDF5 format. + + # filtered_tf_bc_matrix_folder: + # type: File + # outputSource: compress_filtered_tf_bc_matrix_folder/compressed_folder + # label: "Compressed folder with aggregated filtered tf-barcode matrices" + # doc: | + # Folder with aggregated filtered tf-barcode matrices + # containing only cellular barcodes in MEX format. + + # filtered_tf_bc_matrix_h5: + # type: File + # outputSource: aggregate_counts/filtered_tf_bc_matrix_h5 + # label: "Aggregated filtered tf-barcode matrices in HDF5 format" + # doc: | + # Aggregated filtered tf-barcode matrices containing + # only cellular barcodes in HDF5 format. 
+ + loupe_browser_track: + type: File + outputSource: aggregate_counts/loupe_browser_track + label: "Loupe Browser visualization and analysis file for aggregated results" + doc: | + Loupe Browser visualization and analysis file for aggregated results + + aggregation_metadata: + type: File + outputSource: aggregate_counts/aggregation_metadata + label: "Aggregation metadata in CSV format" + doc: | + Aggregation metadata in CSV format + + aggregate_counts_stdout_log: + type: File + outputSource: aggregate_counts/stdout_log + label: "stdout log generated by cellranger-atac aggr" + doc: | + stdout log generated by cellranger-atac aggr + + aggregate_counts_stderr_log: + type: File + outputSource: aggregate_counts/stderr_log + label: "stderr log generated by cellranger-atac aggr" + doc: | + stderr log generated by cellranger-atac aggr + + compressed_html_data_folder: + type: File + outputSource: compress_html_data_folder/compressed_folder + label: "Compressed folder with CellBrowser formatted results" + doc: | + Compressed folder with CellBrowser formatted results + + html_data_folder: + type: Directory + outputSource: cellbrowser_build/html_data + label: "Folder with not compressed CellBrowser formatted results" + doc: | + Folder with not compressed CellBrowser formatted results + + cellbrowser_report: + type: File + outputSource: cellbrowser_build/index_html_file + label: "CellBrowser formatted Cellranger report" + doc: | + CellBrowser formatted Cellranger report + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + +steps: + + aggregate_counts: + run: ../tools/cellranger-atac-aggr.cwl + in: + fragments_file_from_count: fragments_file_from_count + barcode_metrics_report: barcode_metrics_report_from_count + gem_well_labels: gem_well_labels + indices_folder: indices_folder + normalization_mode: normalization_mode + threads: threads + memory_limit: memory_limit + virt_memory_limit: memory_limit + out: + - web_summary_report + - 
metrics_summary_report_json + - metrics_summary_report_csv + - barcode_metrics_report + - fragments_file + - peaks_bed_file + - peak_annotation_file + - secondary_analysis_report_folder + - filtered_feature_bc_matrix_folder + - filtered_feature_bc_matrix_h5 + # - filtered_tf_bc_matrix_folder + # - filtered_tf_bc_matrix_h5 + - aggregation_metadata + - loupe_browser_track + - stdout_log + - stderr_log + + compress_filtered_feature_bc_matrix_folder: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: aggregate_counts/filtered_feature_bc_matrix_folder + out: + - compressed_folder + + # compress_filtered_tf_bc_matrix_folder: + # run: ../tools/tar-compress.cwl + # in: + # folder_to_compress: aggregate_counts/filtered_tf_bc_matrix_folder + # out: + # - compressed_folder + + compress_secondary_analysis_report_folder: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: aggregate_counts/secondary_analysis_report_folder + out: + - compressed_folder + + cellbrowser_build: + run: ../tools/cellbrowser-build-cellranger-atac.cwl + in: + secondary_analysis_report_folder: aggregate_counts/secondary_analysis_report_folder + filtered_feature_bc_matrix_folder: aggregate_counts/filtered_feature_bc_matrix_folder + aggregation_metadata: aggregate_counts/aggregation_metadata + out: + - html_data + - index_html_file + + compress_html_data_folder: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: cellbrowser_build/html_data + out: + - compressed_folder + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Cellranger ATAC Aggregate" +s:name: "Cellranger ATAC Aggregate" +s:alternateName: "Aggregates outputs from multiple runs of Cell Ranger Count Chromatin Accessibility experiments" + +s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/cellranger-atac-aggr.cwl +s:codeRepository: https://github.com/datirium/workflows 
+s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Cellranger ATAC Aggregate + + Aggregates outputs from multiple runs of Cell Ranger Count Chromatin + Accessibility experiments diff --git a/workflows/cellranger-atac-count.cwl b/workflows/cellranger-atac-count.cwl new file mode 100644 index 00000000..fd4f87cb --- /dev/null +++ b/workflows/cellranger-atac-count.cwl @@ -0,0 +1,521 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: InlineJavascriptRequirement + - class: MultipleInputFeatureRequirement + + +'sd:upstream': + genome_indices: + - "cellranger-mkref.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name/Alias" + sd:preview: + position: 1 + + indices_folder: + type: Directory + label: "Genome Type" + doc: "Cell Ranger ARC generated genome indices folder" + 'sd:upstreamSource': "genome_indices/arc_indices_folder" + 'sd:localLabel': true + + fastq_file_r1: + type: + - File + - type: array + items: File + label: "FASTQ file(s) R1 (optionally compressed)" + doc: "FASTQ file(s) R1 
(optionally compressed)" + + fastq_file_r2: + type: + - File + - type: array + items: File + label: "FASTQ file(s) R2 (optionally compressed)" + doc: "FASTQ file(s) R2 (optionally compressed)" + + fastq_file_r3: + type: + - File + - type: array + items: File + label: "FASTQ file(s) R3 (optionally compressed)" + doc: "FASTQ file(s) R3 (optionally compressed)" + + force_cells: + type: int? + default: null + label: "Define the top N barcodes with the most fragments overlapping peaks as cells" + doc: | + Define the top N barcodes with the most fragments overlapping + peaks as cells. N must be a positive integer <= 20,000. Please + consult the documentation before using this option + 'sd:layout': + advanced: true + + threads: + type: int? + default: 4 + label: "Number of threads" + doc: "Number of threads for those steps that support multithreading" + 'sd:layout': + advanced: true + + memory_limit: + type: int? + default: 20 + label: "Genome Type" + doc: | + Maximum memory used (GB). + The same as was used for generating indices. 
The same will be applied to virtual memory + 'sd:upstreamSource': "genome_indices/memory_limit" + 'sd:localLabel': true + + +outputs: + + fastqc_report_fastq_r1: + type: File + outputSource: run_fastqc_for_fastq_r1/html_file + label: "FastQC report for FASTQ file R1" + doc: | + FastQC report for FASTQ file R1 + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + fastqc_report_fastq_r2: + type: File + outputSource: run_fastqc_for_fastq_r2/html_file + label: "FastQC report for FASTQ file R2" + doc: | + FastQC report for FASTQ file R2 + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + fastqc_report_fastq_r3: + type: File + outputSource: run_fastqc_for_fastq_r3/html_file + label: "FastQC report for FASTQ file R3" + doc: | + FastQC report for FASTQ file R3 + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + web_summary_report: + type: File + outputSource: generate_counts_matrix/web_summary_report + label: "Cell Ranger summary" + doc: | + Run summary metrics and charts in HTML format + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + metrics_summary_report_json: + type: File + outputSource: generate_counts_matrix/metrics_summary_report_json + label: "Run summary metrics in JSON format" + doc: | + Run summary metrics in JSON format + + metrics_summary_report_csv: + type: File + outputSource: generate_counts_matrix/metrics_summary_report_csv + label: "Run summary metrics in CSV format" + doc: | + Run summary metrics in CSV format + + barcode_metrics_report: + type: File + outputSource: generate_counts_matrix/barcode_metrics_report + label: "Per-barcode fragment counts & metrics in CSV format" + doc: | + Per-barcode fragment counts & metrics in CSV format + + possorted_genome_bam_bai: + type: File + outputSource: generate_counts_matrix/possorted_genome_bam_bai + label: "Aligned to the genome indexed reads BAM+BAI files" + doc: | + Indexed position-sorted reads aligned 
to the genome annotated + with barcode information in BAM format + + atac_fragments_file: + type: File + outputSource: generate_counts_matrix/fragments_file + label: "Count and barcode information for every ATAC fragment in TSV format" + doc: | + Count and barcode information for every ATAC fragment observed + in the experiment in TSV format + + peaks_bed_file: + type: File + outputSource: generate_counts_matrix/peaks_bed_file + label: "Identified peaks in BED format" + doc: | + Locations of open-chromatin regions identified in the + experiment (these regions are referred to as "peaks") + + peak_annotation_file: + type: File + outputSource: generate_counts_matrix/peak_annotation_file + label: "Annotations of peaks based on genomic proximity alone" + doc: | + Annotations of peaks based on genomic proximity alone + + cut_sites_bigwig_file: + type: File + outputSource: generate_counts_matrix/cut_sites_bigwig_file + label: "Smoothed transposition site track in bigWig format" + doc: | + Smoothed transposition site track in bigWig format + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "ATAC cut sites" + height: 120 + + # peak_motif_mapping_bed: + # type: File + # outputSource: generate_counts_matrix/peak_motif_mapping_bed + # label: "File with peak-motif associations in BED format" + # doc: | + # File with peak-motif associations in BED format + + filtered_feature_bc_matrix_folder: + type: File + outputSource: compress_filtered_feature_bc_matrix_folder/compressed_folder + label: "Compressed folder with filtered peak-barcode matrices" + doc: | + Folder with filtered peak-barcode matrices containing only + cellular barcodes in MEX format. + + filtered_feature_bc_matrix_h5: + type: File + outputSource: generate_counts_matrix/filtered_feature_bc_matrix_h5 + label: "Filtered peak-barcode matrices in HDF5 format" + doc: | + Filtered peak-barcode matrices containing only cellular + barcodes in HDF5 format. 
+ + # filtered_tf_bc_matrix_folder: + # type: File + # outputSource: compress_filtered_tf_bc_matrix_folder/compressed_folder + # label: "Compressed folder with filtered tf-barcode matrices" + # doc: | + # Folder with filtered tf-barcode matrices containing only cellular + # barcodes in MEX format. + + # filtered_tf_bc_matrix_h5: + # type: File + # outputSource: generate_counts_matrix/filtered_tf_bc_matrix_h5 + # label: "Filtered tf-barcode matrices in HDF5 format" + # doc: | + # Filtered tf-barcode matrices containing only cellular + # barcodes in HDF5 format. + + raw_feature_bc_matrices_folder: + type: File + outputSource: compress_raw_feature_bc_matrices_folder/compressed_folder + label: "Compressed folder with unfiltered peak-barcode matrices" + doc: | + Folder with unfiltered peak-barcode matrices containing + all barcodes in MEX format + + raw_feature_bc_matrices_h5: + type: File + outputSource: generate_counts_matrix/raw_feature_bc_matrices_h5 + label: "Unfiltered peak-barcode matrices in HDF5 format" + doc: | + Unfiltered peak-barcode matrices containing all barcodes + in HDF5 format + + secondary_analysis_report_folder: + type: File + outputSource: compress_secondary_analysis_report_folder/compressed_folder + label: "Compressed folder with secondary analysis results" + doc: | + Folder with secondary analysis results + + loupe_browser_track: + type: File + outputSource: generate_counts_matrix/loupe_browser_track + label: "Loupe Browser visualization and analysis file" + doc: | + Loupe Browser visualization and analysis file + + generate_counts_matrix_stdout_log: + type: File + outputSource: generate_counts_matrix/stdout_log + label: stdout log generated by cellranger-atac count + doc: | + stdout log generated by cellranger-atac count + + generate_counts_matrix_stderr_log: + type: File + outputSource: generate_counts_matrix/stderr_log + label: stderr log generated by cellranger-atac count + doc: | + stderr log generated by cellranger-atac count + + 
collected_statistics_yaml: + type: File + outputSource: collect_statistics/collected_statistics_yaml + label: "Collected statistics in YAML format" + doc: "Collected statistics in YAML format" + + collected_statistics_md: + type: File + outputSource: collect_statistics/collected_statistics_md + label: "Collected statistics in Markdown format" + doc: "Collected statistics in Markdown format" + 'sd:visualPlugins': + - markdownView: + tab: 'Overview' + + collected_statistics_tsv: + type: File + outputSource: collect_statistics/collected_statistics_tsv + label: "Collected statistics in TSV format" + doc: "Collected statistics in TSV format" + 'sd:visualPlugins': + - tableView: + vertical: true + tab: 'Overview' + + compressed_html_data_folder: + type: File + outputSource: compress_html_data_folder/compressed_folder + label: "Compressed folder with CellBrowser formatted results" + doc: | + Compressed folder with CellBrowser formatted results + + html_data_folder: + type: Directory + outputSource: cellbrowser_build/html_data + label: "Folder with not compressed CellBrowser formatted results" + doc: | + Folder with not compressed CellBrowser formatted results + + cellbrowser_report: + type: File + outputSource: cellbrowser_build/index_html_file + label: "CellBrowser formatted Cellranger report" + doc: | + CellBrowser formatted Cellranger report + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + +steps: + + extract_fastq_r1: + run: ../tools/extract-fastq.cwl + in: + compressed_file: fastq_file_r1 + output_prefix: + default: "read_1" + out: + - fastq_file + + extract_fastq_r2: + run: ../tools/extract-fastq.cwl + in: + compressed_file: fastq_file_r2 + output_prefix: + default: "read_2" + out: + - fastq_file + + extract_fastq_r3: + run: ../tools/extract-fastq.cwl + in: + compressed_file: fastq_file_r3 + output_prefix: + default: "read_3" + out: + - fastq_file + + run_fastqc_for_fastq_r1: + run: ../tools/fastqc.cwl + in: + reads_file: 
extract_fastq_r1/fastq_file + threads: threads + out: + - html_file + + run_fastqc_for_fastq_r2: + run: ../tools/fastqc.cwl + in: + reads_file: extract_fastq_r2/fastq_file + threads: threads + out: + - html_file + + run_fastqc_for_fastq_r3: + run: ../tools/fastqc.cwl + in: + reads_file: extract_fastq_r3/fastq_file + threads: threads + out: + - html_file + + generate_counts_matrix: + run: ../tools/cellranger-atac-count.cwl + in: + fastq_file_r1: extract_fastq_r1/fastq_file + fastq_file_r2: extract_fastq_r2/fastq_file + fastq_file_r3: extract_fastq_r3/fastq_file + indices_folder: indices_folder + force_cells: force_cells + threads: threads + memory_limit: memory_limit + virt_memory_limit: memory_limit + out: + - web_summary_report + - metrics_summary_report_json + - metrics_summary_report_csv + - barcode_metrics_report + - possorted_genome_bam_bai + - fragments_file + - peaks_bed_file + - peak_annotation_file + - cut_sites_bigwig_file + # - peak_motif_mapping_bed + - filtered_feature_bc_matrix_folder + - filtered_feature_bc_matrix_h5 + # - filtered_tf_bc_matrix_folder + # - filtered_tf_bc_matrix_h5 + - raw_feature_bc_matrices_folder + - raw_feature_bc_matrices_h5 + - secondary_analysis_report_folder + - loupe_browser_track + - stdout_log + - stderr_log + + compress_filtered_feature_bc_matrix_folder: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: generate_counts_matrix/filtered_feature_bc_matrix_folder + out: + - compressed_folder + + # compress_filtered_tf_bc_matrix_folder: + # run: ../tools/tar-compress.cwl + # in: + # folder_to_compress: generate_counts_matrix/filtered_tf_bc_matrix_folder + # out: + # - compressed_folder + + compress_raw_feature_bc_matrices_folder: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: generate_counts_matrix/raw_feature_bc_matrices_folder + out: + - compressed_folder + + compress_secondary_analysis_report_folder: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: 
generate_counts_matrix/secondary_analysis_report_folder + out: + - compressed_folder + + collect_statistics: + run: ../tools/collect-stats-sc-atac-count.cwl + in: + metrics_summary_report: generate_counts_matrix/metrics_summary_report_csv + out: + - collected_statistics_yaml + - collected_statistics_tsv + - collected_statistics_md + + cellbrowser_build: + run: ../tools/cellbrowser-build-cellranger-atac.cwl + in: + secondary_analysis_report_folder: generate_counts_matrix/secondary_analysis_report_folder + filtered_feature_bc_matrix_folder: generate_counts_matrix/filtered_feature_bc_matrix_folder + out: + - html_data + - index_html_file + + compress_html_data_folder: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: cellbrowser_build/html_data + out: + - compressed_folder + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Cell Ranger ATAC Count" +s:name: "Cell Ranger ATAC Count" +s:alternateName: "Counts reads from a single scATAC-Seq library" + +s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/cellranger-atac-count.cwl +s:codeRepository: https://github.com/datirium/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski 
Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Cell Ranger ATAC Count + + Counts reads from a single scATAC-Seq library \ No newline at end of file diff --git a/workflows/cellranger-mkref.cwl b/workflows/cellranger-mkref.cwl index ac2587e1..b6ccb189 100644 --- a/workflows/cellranger-mkref.cwl +++ b/workflows/cellranger-mkref.cwl @@ -64,6 +64,13 @@ outputs: doc: | Cell Ranger generated genome indices folder + chrom_length_file: + type: File + outputSource: cellranger_mkref/chrom_length_file + label: "Chromosome length file" + doc: | + Chromosome length file in TSV format + stdout_log: type: File outputSource: cellranger_mkref/stdout_log @@ -113,6 +120,7 @@ steps: default: "cellranger_ref" out: - indices_folder + - chrom_length_file - stdout_log - stderr_log @@ -226,4 +234,5 @@ s:creator: doc: | - Cell Ranger Build Reference Indices \ No newline at end of file + Cell Ranger Build Reference Indices + =================================== \ No newline at end of file diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index b113cbdd..cfa6ff67 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -515,4 +515,5 @@ s:creator: doc: | Single-cell ATAC-Seq Cluster Analysis - Clusters single-cell ATAC-Seq datasets, identifies differentially accessible peaks. \ No newline at end of file + Clusters single-cell ATAC-Seq datasets, identifies differentially + accessible peaks. 
\ No newline at end of file diff --git a/workflows/sc-atac-coverage.cwl b/workflows/sc-atac-coverage.cwl new file mode 100644 index 00000000..940fff8d --- /dev/null +++ b/workflows/sc-atac-coverage.cwl @@ -0,0 +1,337 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var split_features = function(line) { + function get_unique(value, index, self) { + return self.indexOf(value) === index && value != ""; + } + let splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + + +'sd:upstream': + sc_tools_sample: + - "sc-multiome-filter.cwl" + - "sc-atac-reduce.cwl" + - "sc-atac-cluster.cwl" + - "sc-wnn-cluster.cwl" + - "sc-ctype-assign.cwl" + sc_atac_sample: + - "cellranger-arc-count.cwl" + - "cellranger-arc-aggr.cwl" + - "cellranger-atac-count.cwl" + - "cellranger-atac-aggr.cwl" + genome_indices: + - "genome-indices.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name/alias" + sd:preview: + position: 1 + + query_data_rds: + type: File + label: "Experiment run through any pipeline related Single-cell ATAC-Seq" + doc: | + Path to the RDS file to load Seurat object from. This file + should include chromatin accessibility information stored + in the ATAC assay with a proper seqinfo data. + 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" + 'sd:localLabel': true + + atac_fragments_file: + type: File + secondaryFiles: + - .tbi + label: "Cell Ranger ATAC/ARC Count/Aggregate Experiment" + doc: | + Count and barcode information for every ATAC fragment used in the + loaded Seurat object. File should be saved in TSV format and to be + tbi-indexed. 
+ 'sd:upstreamSource': "sc_atac_sample/atac_fragments_file" + 'sd:localLabel': true + + chrom_length_file: # not used - need it only for IGV + type: File + label: "Genome" + doc: | + Reference genome + 'sd:upstreamSource': "genome_indices/chrom_length" + 'sd:localLabel': true + + splitby: + type: string? + default: "new.ident" + label: "Column(s) from the Seurat object metadata to split cells into groups" + doc: | + Column from the Seurat object metadata to split cells into groups. + May be one of the columns added with --metadata or --barcodes + parameters. Default: split by dataset + + datasets_metadata: + type: File? + label: "Optional TSV/CSV file to extend metadata by dataset" + doc: | + Path to the TSV/CSV file to optionally extend Seurat object metadata with + categorical values using samples identities. First column - 'library_id' + should correspond to all unique values from the 'new.ident' column of the + loaded Seurat object. If any of the provided in this file columns are already + present in the Seurat object metadata, they will be overwritten. When combined + with --barcodes parameter, first the metadata will be extended, then barcode + filtering will be applied. Default: no extra metadata is added + + barcodes_data: + type: File? + label: "Optional TSV/CSV file to prefilter and extend metadata by barcodes. First column should be named as 'barcode'" + doc: | + Path to the TSV/CSV file to optionally prefilter and extend Seurat object + metadata by selected barcodes. First column should be named as 'barcode'. + If file includes any other columns they will be added to the Seurat object + metadata overwriting the existing ones if those are present. + Default: all cells used, no extra metadata is added + + flank_distance: + type: int?
+ default: 5 + label: "Distance in bp to flank both start and end of each fragment in both directions" + doc: | + Distance in bp to flank both start and end of each fragment in both + directions to generate cut sites coverage. Default: 5 + 'sd:layout': + advanced: true + + parallel_memory_limit: + type: + - "null" + - type: enum + symbols: + - "32" + default: "32" + label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Forced to 32 GB + 'sd:layout': + advanced: true + + vector_memory_limit: + type: + - "null" + - type: enum + symbols: + - "64" + default: "64" + label: "Maximum vector memory in GB allowed to be used by R" + doc: | + Maximum vector memory in GB allowed to be used by R. + Forced to 64 GB + 'sd:layout': + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + default: "1" + label: "Number of cores/cpus to use" + doc: | + Number of cores/cpus to use + Forced to 1 + 'sd:layout': + advanced: true + + +outputs: + + peaks_bigbed_file: + type: File + outputSource: sc_atac_coverage/peaks_bigbed_file + label: "Locations of open-chromatin regions" + doc: | + Locations of open-chromatin regions ("peaks") + in bigBed format + 'sd:visualPlugins': + - igvbrowser: + tab: 'Genome Browser' + id: 'igvbrowser' + type: 'annotation' + format: 'bigbed' + name: "Peaks" + height: 40 + + cut_sites_bigwig_file: + type: + - "null" + - type: array + items: File + outputSource: sc_atac_coverage/cut_sites_bigwig_file + label: "Genome coverage for Tn5 cut sites" + doc: | + Genome coverage calculated for Tn5 cut sites + in bigWig format + 'sd:visualPlugins': + - igvbrowser: + tab: 'Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "Cut sites coverage" + height: 120 + + fragments_bigwig_file: + type: + - "null" + - type: array + items: File + outputSource: sc_atac_coverage/fragments_bigwig_file +
label: "Genome coverage for fragments" + doc: | + Genome coverage calculated for fragments + in bigWig format + 'sd:visualPlugins': + - igvbrowser: + tab: 'Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "Fragments coverage" + height: 120 + + experiment_info: + type: File + label: "IGV tracks order" + doc: | + Markdown file to explain the tracks order for IGV + outputSource: create_metadata/output_file + 'sd:visualPlugins': + - markdownView: + tab: 'Overview' + + sc_atac_coverage_stdout_log: + type: File + outputSource: sc_atac_coverage/stdout_log + label: "stdout log generated by sc_atac_coverage step" + doc: | + stdout log generated by sc_atac_coverage step + + sc_atac_reduce_stderr_log: + type: File + outputSource: sc_atac_coverage/stderr_log + label: "stderr log generated by sc_atac_coverage step" + doc: | + stderr log generated by sc_atac_coverage step + + +steps: + + sc_atac_coverage: + run: ../tools/sc-atac-coverage.cwl + in: + query_data_rds: query_data_rds + atac_fragments_file: atac_fragments_file + splitby: + source: splitby + valueFrom: $(split_features(self)) + datasets_metadata: datasets_metadata + barcodes_data: barcodes_data + flank_distance: flank_distance + verbose: + default: true + parallel_memory_limit: + source: parallel_memory_limit + valueFrom: $(parseInt(self)) + vector_memory_limit: + source: vector_memory_limit + valueFrom: $(parseInt(self)) + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - peaks_bigbed_file + - cut_sites_bigwig_file + - fragments_bigwig_file + - stdout_log + - stderr_log + + create_metadata: + run: ../tools/custom-bash.cwl + in: + input_file: sc_atac_coverage/fragments_bigwig_file + script: + default: | + #!/bin/bash + set -- "$0" "$@" + echo "| Name | Index |" > experiment_info.md + echo "| :-- | --: |" >> experiment_info.md + j=1 + for i in "${@}"; do + echo "| `basename $i` | $j |" >> experiment_info.md + (( j++ )) + done; + out: + - output_file + + +$namespaces: + s: http://schema.org/ + 
+$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-cell ATAC-Seq Genome Coverage" +s:name: "Single-cell ATAC-Seq Genome Coverage" +s:alternateName: "Creates genome coverage bigWig files from the provided fragments file and selected grouping parameters" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-atac-coverage.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell ATAC-Seq Genome Coverage + + Creates genome coverage bigWig files from the provided + fragments file and selected grouping parameters \ No newline at end of file diff --git a/workflows/sc-atac-dbinding.cwl b/workflows/sc-atac-dbinding.cwl new file mode 100644 index 00000000..54095eee --- /dev/null +++ b/workflows/sc-atac-dbinding.cwl @@ -0,0 +1,952 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - 
class: InlineJavascriptRequirement + expressionLib: + - var split_features = function(line) { + function get_unique(value, index, self) { + return self.indexOf(value) === index && value != ""; + } + let splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + + +'sd:upstream': + sc_tools_sample: + - "sc-atac-cluster.cwl" + - "sc-wnn-cluster.cwl" + - "sc-ctype-assign.cwl" + sc_atac_sample: + - "cellranger-arc-count.cwl" + - "cellranger-arc-aggr.cwl" + - "cellranger-atac-count.cwl" + - "cellranger-atac-aggr.cwl" + genome_indices: + - "genome-indices.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name/alias" + sd:preview: + position: 1 + + query_data_rds: + type: File + label: "Experiment run through any pipeline related Single-cell ATAC-Seq" + doc: | + Path to the RDS file to load Seurat object from. + This file should include chromatin accessibility + information stored in the ATAC assay. Additionally + 'rnaumap', and/or 'atacumap', and/or 'wnnumap' + dimensionality reductions should be present. + 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" + 'sd:localLabel': true + + atac_fragments_file: + type: File + secondaryFiles: + - .tbi + label: "Cell Ranger ATAC/ARC Count/Aggregate Experiment" + doc: | + Count and barcode information for every ATAC fragment + used in the loaded Seurat object. File should be saved + in TSV format and should be + tbi-indexed. + 'sd:upstreamSource': "sc_atac_sample/atac_fragments_file" + 'sd:localLabel': true + + genome_type: + type: string + label: "Genome" + doc: | + Reference genome + 'sd:upstreamSource': "genome_indices/genome" + 'sd:localLabel': true + + datasets_metadata: + type: File? + label: "Optional TSV/CSV file to extend metadata by dataset" + doc: | + Path to the TSV/CSV file to optionally extend Seurat + object metadata with categorical values using samples + identities.
First column - 'library_id' should + correspond to all unique values from the 'new.ident' + column of the loaded Seurat object. If any of the + provided in this file columns are already present in + the Seurat object metadata, they will be overwritten. + When combined with --barcodes parameter, first the + metadata will be extended, then barcode filtering will + be applied. Default: no extra metadata is added + + barcodes_data: + type: File? + label: "Optional TSV/CSV file to prefilter and extend metadata by barcodes. First column should be named as 'barcode'" + doc: | + Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata overwriting the existing ones if + those are present. Default: all cells used, no extra + metadata is added + + groupby: + type: string? + default: null + label: "Category to group cells for optional subsetting" + doc: | + Column from the Seurat object metadata to group cells + for optional subsetting when combined with --subset + parameter. May be one of the extra metadata columns + added with --metadata or --barcodes parameters. + Ignored if --subset is not set. Default: do not + subset, include all cells into analysis. + + subset: + type: string? + default: null + label: "List of values to subset cells from the selected category" + doc: | + Values from the column set with --groupby parameter to + subset cells before running differential binding + analysis. Ignored if --groupby is not provided. + Default: do not subset cells, include all of them. + + splitby: + type: string + label: "Category to split cells into two groups" + doc: | + Column from the Seurat object metadata to split cells + into two groups to run --second vs --first + differential binding analysis. May be one of the extra + metadata columns added with --metadata or --barcodes + parameters.
+ + first_cond: + type: string + label: "Value from the selected category to define the first group of cells" + doc: | + Value from the Seurat object metadata column set with + --splitby parameter to define the first group of cells + for differential binding analysis. + + second_cond: + type: string + label: "Value from the selected category to define the second group of cells" + doc: | + Value from the Seurat object metadata column set with + --splitby parameter to define the second group of + cells for differential binding analysis. + + analysis_method: + type: + - "null" + - type: enum + symbols: + - "negative-binomial" # (negbinom) Negative Binomial Generalized Linear Model (use FindMarkers with peaks from Seurat object) + - "poisson" # (poisson) Poisson Generalized Linear Model (use FindMarkers with peaks from Seurat object) + - "logistic-regression" # (LR) Logistic Regression (use FindMarkers with peaks from Seurat object) + - "mast" # (MAST) MAST package (use FindMarkers with peaks from Seurat object) + - "manorm2" # call peaks for each group with MACS2, run MAnorm2 + default: "logistic-regression" + label: "Test type to use in differential binding analysis" + doc: | + Test type to use in differential binding analysis. For + all tests except manorm2, peaks present in the loaded + Seurat object will be used. If manorm2 test selected, + peaks will be called per group defined by --splitby + parameter. Default: logistic-regression + + maximum_padj: + type: float? + default: 0.05 + label: "Maximum adjusted P-value to show in IGV" + doc: | + In the exploratory visualization part of the analysis + output only differentially bound peaks with adjusted + P-value not bigger than this value. Default: 0.05 + + minimum_logfc: + type: float? + default: 1 + label: "Minimum log2 Fold Change value to show in IGV" + doc: | + In the exploratory visualization part of the analysis + output only differentially bound peaks with log2 Fold + Change not smaller than this value.
Default: 1.0 + + blacklist_regions_file: + type: File? + label: "Optional BED file with the genomic blacklist regions (for manorm2)" + doc: | + Path to the optional BED file with the genomic + blacklist regions to be filtered out before running + differential binding analysis. Any reference genomic + bin overlapping a blacklist region will be removed + from the output. Ignored if --test is not set to + manorm2. + + minimum_qvalue: + type: float? + default: 0.05 + label: "Minimum FDR (q-value) cutoff for MACS2 peak detection (for manorm2)" + doc: | + Minimum FDR (q-value) cutoff for MACS2 peak detection. + Ignored if --test is not set to manorm2. Default: 0.05 + 'sd:layout': + advanced: true + + minimum_peak_gap: + type: int? + default: 150 + label: "Minimum distance between the peaks to be merged (for manorm2)" + doc: | + If a distance between peaks is smaller than the + provided value they will be merged before splitting + them into reference genomic bins of size --binsize. + Ignored if --test is not set to manorm2. Default: 150 + 'sd:layout': + advanced: true + + bin_size: + type: int? + default: 1000 + label: "The size of non-overlapping reference genomic bins (for manorm2)" + doc: | + The size of non-overlapping reference genomic bins + used by MAnorm2 when generating a table of reads + counts per peaks. Ignored if --test is not set to + manorm2. Default: 1000 + 'sd:layout': + advanced: true + + maximum_peaks: + type: int? + default: 0 + label: "The maximum number of the most significant peaks to keep, 0 - keep all (for manorm2)" + doc: | + The maximum number of the most significant (based on + qvalue) peaks to keep from each group of cells when + constructing reference genomic bins. Ignored if --test + is not set to manorm2.
Default: keep all peaks + 'sd:layout': + advanced: true + + parallel_memory_limit: + type: + - "null" + - type: enum + symbols: + - "32" + default: "32" + label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Forced to 32 GB + 'sd:layout': + advanced: true + + vector_memory_limit: + type: + - "null" + - type: enum + symbols: + - "64" + default: "64" + label: "Maximum vector memory in GB allowed to be used by R" + doc: | + Maximum vector memory in GB allowed to be used by R. + Forced to 64 GB + 'sd:layout': + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + default: "1" + label: "Number of cores/cpus to use" + doc: | + Number of cores/cpus to use + Forced to 1 + 'sd:layout': + advanced: true + + +outputs: + + umap_rd_rnaumap_plot_png: + type: File? + outputSource: sc_atac_dbinding/umap_rd_rnaumap_plot_png + label: "Cells RNA UMAP split by selected criteria" + doc: | + Cells UMAP split by selected criteria, + optionally subsetted to the specific + group (rnaumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Cells RNA UMAP split by selected criteria' + + umap_rd_atacumap_plot_png: + type: File? + outputSource: sc_atac_dbinding/umap_rd_atacumap_plot_png + label: "Cells ATAC UMAP split by selected criteria" + doc: | + Cells UMAP split by selected criteria, + optionally subsetted to the specific + group (atacumap dim. reduction). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Cells ATAC UMAP split by selected criteria' + + umap_rd_wnnumap_plot_png: + type: File? + outputSource: sc_atac_dbinding/umap_rd_wnnumap_plot_png + label: "Cells WNN UMAP split by selected criteria" + doc: | + Cells UMAP split by selected criteria, + optionally subsetted to the specific + group (wnnumap dim. reduction).
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Cells WNN UMAP split by selected criteria' + + dbnd_vlcn_plot_png: + type: File? + outputSource: sc_atac_dbinding/dbnd_vlcn_plot_png + label: "Volcano plot of differentially bound sites" + doc: | + Volcano plot of differentially bound sites. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Volcano plot of differentially bound sites' + + seurat_peaks_bigbed_file: + type: File + outputSource: sc_atac_dbinding/seurat_peaks_bigbed_file + label: "Peaks from the provided Seurat object" + doc: | + Peaks in bigBed format extracted + from the loaded from provided RDS + file Seurat object. + 'sd:visualPlugins': + - igvbrowser: + tab: 'Genome Browser' + id: 'igvbrowser' + type: 'annotation' + format: 'bigbed' + name: "Seurat peaks" + height: 40 + + first_fragments_bigwig_file: + type: File + outputSource: sc_atac_dbinding/first_fragments_bigwig_file + label: "Genome coverage for fragments (first)" + doc: | + Genome coverage in bigWig format calculated + for fragments from the cells that belong to + the group defined by the --first and + --groupby parameters. + 'sd:visualPlugins': + - igvbrowser: + tab: 'Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "Fragments coverage (first)" + height: 120 + + second_fragments_bigwig_file: + type: File + outputSource: sc_atac_dbinding/second_fragments_bigwig_file + label: "Genome coverage for fragments (second)" + doc: | + Genome coverage in bigWig format calculated + for fragments from the cells that belong to + the group defined by the --second and + --groupby parameters. + 'sd:visualPlugins': + - igvbrowser: + tab: 'Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "Fragments coverage (second)" + height: 120 + + first_tn5ct_bigwig_file: + type: File? 
+ outputSource: sc_atac_dbinding/first_tn5ct_bigwig_file + label: "Genome coverage for Tn5 cut sites (first)" + doc: | + Genome coverage in bigWig format calculated + for Tn5 cut sites from the cells that belong + to the group defined by the --first and + --groupby parameters. + 'sd:visualPlugins': + - igvbrowser: + tab: 'Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "Tn5 coverage (first)" + height: 120 + + second_tn5ct_bigwig_file: + type: File? + outputSource: sc_atac_dbinding/second_tn5ct_bigwig_file + label: "Genome coverage for Tn5 cut sites (second)" + doc: | + Genome coverage in bigWig format calculated + for Tn5 cut sites from the cells that belong + to the group defined by the --second and + --groupby parameters. + 'sd:visualPlugins': + - igvbrowser: + tab: 'Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "Tn5 coverage (second)" + height: 120 + + first_peaks_xls_file: + type: File? + outputSource: sc_atac_dbinding/first_peaks_xls_file + label: "MACS2 report in XLS format (first)" + doc: | + MACS2 report in XLS format for peaks + called from the Tn5 cut sites of the + cells that belong to the group defined + by the --first and --groupby parameters. + + second_peaks_xls_file: + type: File? + outputSource: sc_atac_dbinding/second_peaks_xls_file + label: "MACS2 report in XLS format (second)" + doc: | + MACS2 report in XLS format for peaks + called from the Tn5 cut sites of the + cells that belong to the group defined + by the --second and --groupby parameters. + + first_peaks_bed_file: + type: File? + outputSource: sc_atac_dbinding/first_peaks_bed_file + label: "MACS2 peaks in narrowPeak format (first)" + doc: | + MACS2 peaks in narrowPeak format called + from the Tn5 cut sites of the cells that + belong to the group defined by the --first + and --groupby parameters. 
+ 'sd:visualPlugins': + - igvbrowser: + tab: 'Genome Browser' + id: 'igvbrowser' + type: 'annotation' + name: "Called peaks (first)" + displayMode: "COLLAPSE" + height: 40 + + second_peaks_bed_file: + type: File? + outputSource: sc_atac_dbinding/second_peaks_bed_file + label: "MACS2 peaks in narrowPeak format (second)" + doc: | + MACS2 peaks in narrowPeak format called + from the Tn5 cut sites of the cells that + belong to the group defined by the --second + and --groupby parameters. + 'sd:visualPlugins': + - igvbrowser: + tab: 'Genome Browser' + id: 'igvbrowser' + type: 'annotation' + name: "Called peaks (second)" + displayMode: "COLLAPSE" + height: 40 + + first_summits_bed_file: + type: File? + outputSource: sc_atac_dbinding/first_summits_bed_file + label: "MACS2 peaks summits in BED format (first)" + doc: | + MACS2 peaks summits in BED format called + from the Tn5 cut sites of the cells that + belong to the group defined by the --first + and --groupby parameters. + + second_summits_bed_file: + type: File? + outputSource: sc_atac_dbinding/second_summits_bed_file + label: "MACS2 peaks summits in BED format (second)" + doc: | + MACS2 peaks summits in BED format called + from the Tn5 cut sites of the cells that + belong to the group defined by the --second + and --groupby parameters. + + diff_bound_sites: + type: File + outputSource: sc_atac_dbinding/diff_bound_sites + label: "Differentially bound sites" + doc: | + Not filtered differentially bound sites + in TSV format + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'Diff bound sites' + Title: 'Differentially bound sites. Not filtered' + + diff_bound_sites_with_labels: + type: File + outputSource: add_label_column/output_file + label: "Differentially bound sites with labels" + doc: | + Not filtered differentially bound sites + with labels in TSV format + + first_enrch_bigbed_file: + type: File? 
+ outputSource: sc_atac_dbinding/first_enrch_bigbed_file + label: "Significant differentially bound sites (first)" + doc: | + Peaks in bigBed format filtered by + --padj and --logfc thresholds enriched + in the group of cells defined by the + --first and --groupby parameters. + 'sd:visualPlugins': + - igvbrowser: + tab: 'Genome Browser' + id: 'igvbrowser' + type: 'annotation' + format: 'bigbed' + name: "Diff. bound sites (first)" + height: 40 + + second_enrch_bigbed_file: + type: File? + outputSource: sc_atac_dbinding/second_enrch_bigbed_file + label: "Significant differentially bound sites (second)" + doc: | + Peaks in bigBed format filtered by + --padj and --logfc thresholds enriched + in the group of cells defined by the + --second and --groupby parameters. + 'sd:visualPlugins': + - igvbrowser: + tab: 'Genome Browser' + id: 'igvbrowser' + type: 'annotation' + format: 'bigbed' + name: "Diff. bound sites (second)" + height: 40 + + first_enrch_bed_file: + type: File? + outputSource: sc_atac_dbinding/first_enrch_bed_file + label: "Significant differentially bound sites (first)" + doc: | + Peaks in BED format filtered by + --padj and --logfc thresholds enriched + in the group of cells defined by the + --first and --groupby parameters. + + second_enrch_bed_file: + type: File? + outputSource: sc_atac_dbinding/second_enrch_bed_file + label: "Significant differentially bound sites (second)" + doc: | + Peaks in BED format filtered by + --padj and --logfc thresholds enriched + in the group of cells defined by the + --second and --groupby parameters. 
+ + volcano_plot_html_file: + type: File + outputSource: make_volcano_plot/html_file + label: "Volcano Plot" + doc: | + HTML index file for Volcano Plot + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + volcano_plot_html_data: + type: Directory + outputSource: make_volcano_plot/html_data + label: "Directory html data for Volcano Plot" + doc: | + Directory html data for Volcano Plot + + tag_density_matrix: + type: File + outputSource: compute_score_matrix/scores_matrix + label: "Score matrix for tag density heatmap" + doc: | + Scores matrix generated by + Deeptools with tag density + information around centers + of regions of interest. + + tag_density_heatmap: + type: File + outputSource: make_heatmap/heatmap_file + label: "Tag density heatmap" + doc: | + Tag density heatmap around centers + of differentially bound sites in + PNG format + 'sd:visualPlugins': + - image: + tab: 'Overall' + Caption: 'Tag density heatmap around centers of diff. bound sites' + + sc_atac_dbinding_stdout_log: + type: File + outputSource: sc_atac_dbinding/stdout_log + label: "stdout log generated by sc_atac_dbinding step" + doc: | + stdout log generated by sc_atac_dbinding step + + sc_atac_dbinding_stderr_log: + type: File + outputSource: sc_atac_dbinding/stderr_log + label: "stderr log generated by sc_atac_dbinding step" + doc: | + stderr log generated by sc_atac_dbinding step + + +steps: + + sc_atac_dbinding: + run: ../tools/sc-atac-dbinding.cwl + in: + query_data_rds: query_data_rds + atac_fragments_file: atac_fragments_file + datasets_metadata: datasets_metadata + barcodes_data: barcodes_data + groupby: + source: groupby + valueFrom: $(self==""?null:self) # safety measure + subset: + source: subset + valueFrom: $(split_features(self)) + splitby: splitby + first_cond: first_cond + second_cond: second_cond + analysis_method: analysis_method + genome_type: + source: genome_type + valueFrom: $(self=="mm10"?"mm":"hs") + minimum_qvalue: minimum_qvalue + 
minimum_peak_gap: minimum_peak_gap + bin_size: bin_size + maximum_peaks: + source: maximum_peaks + valueFrom: $(self==0?null:self) # to return null for 0 + blacklist_regions_file: blacklist_regions_file + maximum_padj: maximum_padj + minimum_logfc: minimum_logfc + verbose: + default: true + parallel_memory_limit: + source: parallel_memory_limit + valueFrom: $(parseInt(self)) + vector_memory_limit: + source: vector_memory_limit + valueFrom: $(parseInt(self)) + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - umap_rd_rnaumap_plot_png + - umap_rd_atacumap_plot_png + - umap_rd_wnnumap_plot_png + - seurat_peaks_bigbed_file + - first_fragments_bigwig_file + - second_fragments_bigwig_file + - first_tn5ct_bigwig_file + - second_tn5ct_bigwig_file + - first_peaks_xls_file + - second_peaks_xls_file + - first_peaks_bed_file + - second_peaks_bed_file + - first_summits_bed_file + - second_summits_bed_file + - diff_bound_sites + - dbnd_vlcn_plot_png + - first_enrch_bigbed_file + - second_enrch_bigbed_file + - first_enrch_bed_file + - second_enrch_bed_file + - stdout_log + - stderr_log + + add_label_column: + run: ../tools/custom-bash.cwl + in: + input_file: sc_atac_dbinding/diff_bound_sites + script: + default: | + HEADER=`head -n 1 $0`; + echo -e "label\t${HEADER}" > diff_sts_labeled.tsv; + cat "$0" | grep -v "start" | awk -F "\t" '{print $1":"$2"-"$3"\t"$0}' >> diff_sts_labeled.tsv + out: + - output_file + + make_volcano_plot: + run: ../tools/volcano-plot.cwl + in: + diff_expr_file: add_label_column/output_file + x_axis_column: + default: "log2FoldChange" + y_axis_column: + default: "padj" + label_column: + default: "label" + out: + - html_data + - html_file + + recenter_first_enrch_bed: + run: + cwlVersion: v1.0 + class: CommandLineTool + hints: + - class: DockerRequirement + dockerPull: biowardrobe2/scidap:v0.0.3 + requirements: + - class: InitialWorkDirRequirement + listing: + - entryname: dummy.csv + entry: | + chr1,1,10 + inputs: + script: + type: 
string? + default: | + #!/bin/bash + cat "$0" | tr -d "\r" | tr "," "\t" | awk NF | sort -u -k1,1 -k2,2n -k3,3n | awk '{center=$2+int(($3-$2)/2); print $1"\t"center"\t"center+1}' > first_recentered.bed + inputBinding: + position: 1 + regions_file: + type: + - "null" + - string + - File + inputBinding: + position: 5 + valueFrom: $(self==""?"dummy.csv":self) + default: "" + outputs: + recentered_regions_file: + type: File + outputBinding: + glob: "first_recentered.bed" + baseCommand: [bash, '-c'] + in: + regions_file: sc_atac_dbinding/first_enrch_bed_file + out: + - recentered_regions_file + + recenter_second_enrch_bed: + run: + cwlVersion: v1.0 + class: CommandLineTool + hints: + - class: DockerRequirement + dockerPull: biowardrobe2/scidap:v0.0.3 + requirements: + - class: InitialWorkDirRequirement + listing: + - entryname: dummy.csv + entry: | + chr1,1,10 + inputs: + script: + type: string? + default: | + #!/bin/bash + cat "$0" | tr -d "\r" | tr "," "\t" | awk NF | sort -u -k1,1 -k2,2n -k3,3n | awk '{center=$2+int(($3-$2)/2); print $1"\t"center"\t"center+1}' > second_recentered.bed + inputBinding: + position: 1 + regions_file: + type: + - "null" + - string + - File + inputBinding: + position: 5 + valueFrom: $(self==""?"dummy.csv":self) + default: "" + outputs: + recentered_regions_file: + type: File + outputBinding: + glob: "second_recentered.bed" + baseCommand: [bash, '-c'] + in: + regions_file: sc_atac_dbinding/second_enrch_bed_file + out: + - recentered_regions_file + + compute_score_matrix: + run: ../tools/deeptools-computematrix-referencepoint.cwl + in: + score_files: + source: + - analysis_method + - sc_atac_dbinding/first_fragments_bigwig_file + - sc_atac_dbinding/second_fragments_bigwig_file + - sc_atac_dbinding/first_tn5ct_bigwig_file + - sc_atac_dbinding/second_tn5ct_bigwig_file + valueFrom: | + ${ + if (self[0] != "manorm2" ) { /* self[0] is the analysis_method value itself */ + return [self[1], self[2]]; + } else { + return [self[3], self[4]]; + } + } + regions_files: + - 
recenter_first_enrch_bed/recentered_regions_file + - recenter_second_enrch_bed/recentered_regions_file + reference_point: + default: "TSS" # doesn't matter what we set here because we centered regions ourselves + before_region_start_length: + default: 5000 + after_region_start_length: + default: 5000 + bin_size: + default: 10 + sort_regions: + default: "descend" + samples_label: + source: + - first_cond + - second_cond + valueFrom: $(["Reads " + self[0], "Reads " + self[1]]) + output_filename: + default: "score_matrix.gz" + missing_data_as_zero: + default: true + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - scores_matrix + - stdout_log + - stderr_log + + make_heatmap: + run: ../tools/deeptools-plotheatmap.cwl + in: + plot_title: + default: "Tag density around peak centers" + scores_matrix: compute_score_matrix/scores_matrix + output_filename: + default: "tag_density_heatmap.png" + plot_type: + default: "lines" + sort_regions: + default: "descend" + average_type_summary_plot: + default: "mean" + what_to_show: + default: "plot, heatmap and colorbar" + ref_point_label: + default: "Peak Center" + regions_label: + source: + - first_cond + - second_cond + valueFrom: $(["Peaks " + self[0], "Peaks " + self[1]]) + samples_label: + source: + - first_cond + - second_cond + valueFrom: $(["Reads " + self[0], "Reads " + self[1]]) + x_axis_label: + default: "distance (bp)" + y_axisLabel: + default: "Signal mean" + per_group: + default: false + plot_file_format: + default: "png" + legend_location: + default: "upper-left" + out: + - heatmap_file + - stdout_log + - stderr_log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-cell ATAC-Seq Differential Binding Analysis" +s:name: "Single-cell ATAC-Seq Differential Binding Analysis" +s:alternateName: "Identifies differential bound sites between two groups of cells" + +s:downloadUrl: 
https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-atac-dbinding.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell ATAC-Seq Differential Binding Analysis + + Identifies differential bound sites between two + groups of cells \ No newline at end of file diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index 332f84a5..625f2f5e 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -78,8 +78,8 @@ inputs: provided, use from 2 to N LSI components. If multiple values are provided, subset to only selected LSI components. In combination with --ntgr set to harmony, - selected principle components will be used in Harmony - integration. + multiple values will result in using all dimensions + starting from 1(!) to the max of the provided values. Default: from 2 to 10 normalization_method: @@ -332,6 +332,78 @@ outputs: tab: 'Per dataset' Caption: 'Split by dataset cells UMAP' + umap_spl_umi_plot_png: + type: File? 
+ outputSource: sc_atac_reduce/umap_spl_umi_plot_png + label: "Split by the UMI per cell counts cells UMAP" + doc: | + Split by the UMI per cell counts cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by the UMI per cell counts cells UMAP' + + umap_spl_peak_plot_png: + type: File? + outputSource: sc_atac_reduce/umap_spl_peak_plot_png + label: "Split by the peaks per cell counts cells UMAP" + doc: | + Split by the peaks per cell counts cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by the peaks per cell counts cells UMAP' + + umap_spl_tss_plot_png: + type: File? + outputSource: sc_atac_reduce/umap_spl_tss_plot_png + label: "Split by the TSS enrichment score cells UMAP" + doc: | + Split by the TSS enrichment score cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by the TSS enrichment score cells UMAP' + + umap_spl_ncls_plot_png: + type: File? + outputSource: sc_atac_reduce/umap_spl_ncls_plot_png + label: "Split by the nucleosome signal cells UMAP" + doc: | + Split by the nucleosome signal cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by the nucleosome signal cells UMAP' + + umap_spl_frip_plot_png: + type: File? + outputSource: sc_atac_reduce/umap_spl_frip_plot_png + label: "Split by the FRiP cells UMAP" + doc: | + Split by the FRiP cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by the FRiP cells UMAP' + + umap_spl_blck_plot_png: + type: File? + outputSource: sc_atac_reduce/umap_spl_blck_plot_png + label: "Split by the genomic blacklist regions fraction cells UMAP" + doc: | + Split by the genomic blacklist regions fraction cells UMAP. + PNG format + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Split by the genomic blacklist regions fraction cells UMAP' + umap_spl_cnd_plot_png: type: File? 
outputSource: sc_atac_reduce/umap_spl_cnd_plot_png @@ -409,6 +481,12 @@ steps: - umap_plot_png - umap_spl_idnt_plot_png - umap_spl_cnd_plot_png + - umap_spl_umi_plot_png + - umap_spl_peak_plot_png + - umap_spl_tss_plot_png + - umap_spl_ncls_plot_png + - umap_spl_frip_plot_png + - umap_spl_blck_plot_png - seurat_data_rds - stdout_log - stderr_log @@ -462,4 +540,5 @@ s:creator: doc: | Single-cell ATAC-Seq Dimensionality Reduction Analysis - Integrates multiple single-cell ATAC-Seq datasets, reduces dimensionality using LSI. \ No newline at end of file + Integrates multiple single-cell ATAC-Seq datasets, + reduces dimensionality using LSI. \ No newline at end of file diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index db9a6ce7..cc45dee6 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -819,5 +819,5 @@ s:creator: doc: | Single-cell Manual Cell Type Assignment - + Assigns cell types for clusters based on the provided metadata file. \ No newline at end of file diff --git a/workflows/sc-format-transform.cwl b/workflows/sc-format-transform.cwl new file mode 100644 index 00000000..9783c1a1 --- /dev/null +++ b/workflows/sc-format-transform.cwl @@ -0,0 +1,133 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: +- class: SubworkflowFeatureRequirement +- class: StepInputExpressionRequirement +- class: InlineJavascriptRequirement +- class: MultipleInputFeatureRequirement + + +inputs: + + alias: + type: string + label: "Experiment short name/Alias" + sd:preview: + position: 1 + + compressed_sparse_matrix: + type: File + label: "Compressed folder with feature-barcode matrix in MEX format" + doc: | + Compressed folder with feature-barcode matrix from + Cell Ranger Count/Aggregate experiment in MEX format + + metadata: + type: File? 
+ label: "Aggregation metadata in CSV format" + doc: | + Aggregation metadata file from Cell Ranger + Aggregate experiment + + +outputs: + + filtered_feature_bc_matrix_folder: + type: File + outputSource: pipe/filtered_feature_bc_matrix_folder + label: "Compressed folder with feature-barcode matrix in MEX format" + doc: | + Compressed folder with feature-barcode matrix from + Cell Ranger Count/Aggregate experiment in MEX format + + aggregation_metadata: + type: File? + outputSource: pipe/aggregation_metadata + label: "Aggregation metadata in CSV format" + doc: | + Aggregation metadata file from Cell Ranger + Aggregate experiment + + +steps: + + pipe: + run: + cwlVersion: v1.0 + class: ExpressionTool + inputs: + compressed_sparse_matrix: + type: File + metadata: + type: File? + outputs: + filtered_feature_bc_matrix_folder: + type: File + aggregation_metadata: + type: File? + expression: | + ${ + return { + "filtered_feature_bc_matrix_folder": inputs.compressed_sparse_matrix, + "aggregation_metadata": inputs.metadata + }; + } + in: + compressed_sparse_matrix: compressed_sparse_matrix + metadata: metadata + out: + - filtered_feature_bc_matrix_folder + - aggregation_metadata + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-cell Format Transform" +s:name: "Single-cell Format Transform" +s:alternateName: "Transforms single-cell sequencing data formats into Cell Ranger like output" + +s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/sc-format-transform.cwl +s:codeRepository: https://github.com/datirium/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + 
s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell Format Transform + + Transforms single-cell sequencing data formats into Cell Ranger like output diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 2914f9d2..8eee63cb 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -65,6 +65,14 @@ inputs: 'sd:upstreamSource': "sc_arc_sample/genome_indices/genome_indices/annotation_gtf" 'sd:localLabel': true + chrom_length_file: + type: File + label: "Cell Ranger ARC Count/Aggregate Experiment" + doc: | + Chromosome length file in TSV format + 'sd:upstreamSource': "sc_arc_sample/genome_indices/chrom_length_file" + 'sd:localLabel': true + grouping_data: type: File? label: "Optional TSV/CSV file to define datasets grouping with 'library_id' and 'condition' columns. Rows order should correspond to the aggregation metadata." @@ -246,6 +254,70 @@ inputs: 'sd:layout': advanced: true + remove_doublets: + type: + - type: enum + symbols: + - "union" + - "onlyrna" + - "onlyatac" + - "intersect" + - "none" + default: "none" + label: "Remove cells that were identified as doublets" + doc: | + Remove cells that were identified as doublets. For + RNA assay cells with UMI < 200 will not be evaluated. + Default: do not remove doublets + 'sd:layout': + advanced: true + + rna_doublet_rate: + type: float? 
+ default: null + label: "Expected RNA doublet rate" + doc: | + Expected RNA doublet rate. Default: 1 percent per + thousand cells captured with 10x genomics + 'sd:layout': + advanced: true + + rna_doublet_rate_sd: + type: float? + default: null + label: "Uncertainty range in the RNA doublet rate" + doc: | + Uncertainty range in the RNA doublet rate, interpreted as + a +/- around the value provided in --rnadbr. Set to 0 to + disable. Set to 1 to make the threshold depend entirely + on the misclassification rate. Default: 40 percents of the + value provided in --rnadbr + 'sd:layout': + advanced: true + + atac_doublet_rate: + type: float? + default: null + label: "Expected ATAC doublet rate" + doc: | + Expected ATAC doublet rate. Default: 1 percent per thousand + cells captured with 10x genomics + 'sd:layout': + advanced: true + + atac_doublet_rate_sd: + type: float? + default: null + label: "Uncertainty range in the ATAC doublet rate" + doc: | + Uncertainty range in the ATAC doublet rate, interpreted as + a +/- around the value provided in --atacdbr. Set to 0 to + disable. Set to 1 to make the threshold depend entirely + on the misclassification rate. Default: 40 percents of the + value provided in --atacdbr + 'sd:layout': + advanced: true + color_theme: type: - "null" @@ -483,6 +555,42 @@ outputs: tab: 'Not filtered QC' Caption: 'QC metrics per cell density' + raw_rnadbl_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_rnadbl_plot_png + label: "Percentage of RNA doublets per dataset (not filtered)" + doc: | + Percentage of RNA doublets per dataset (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Percentage of RNA doublets per dataset' + + raw_atacdbl_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_atacdbl_plot_png + label: "Percentage of ATAC doublets per dataset (not filtered)" + doc: | + Percentage of ATAC doublets per dataset (not filtered). 
+ PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Percentage of ATAC doublets per dataset' + + raw_vrlpdbl_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_vrlpdbl_plot_png + label: "Doublets overlap for RNA and ATAC assays per dataset (not filtered)" + doc: | + Doublets overlap for RNA and ATAC assays per dataset (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Doublets overlap for RNA and ATAC assays per dataset' + raw_tss_nrch_plot_png: type: File? outputSource: sc_multiome_filter/raw_tss_nrch_plot_png @@ -761,6 +869,42 @@ outputs: tab: 'Mid. filtered QC' Caption: 'QC metrics per cell density' + mid_fltr_rnadbl_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_rnadbl_plot_png + label: "Percentage of RNA doublets per dataset (intermediate filtered)" + doc: | + Percentage of RNA doublets per dataset (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'Percentage of RNA doublets per dataset' + + mid_fltr_atacdbl_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_atacdbl_plot_png + label: "Percentage of ATAC doublets per dataset (intermediate filtered)" + doc: | + Percentage of ATAC doublets per dataset (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'Percentage of ATAC doublets per dataset' + + mid_fltr_vrlpdbl_plot_png: + type: File? + outputSource: sc_multiome_filter/mid_fltr_vrlpdbl_plot_png + label: "Doublets overlap for RNA and ATAC assays per dataset (intermediate filtered)" + doc: | + Doublets overlap for RNA and ATAC assays per dataset (intermediate filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Mid. filtered QC' + Caption: 'Doublets overlap for RNA and ATAC assays per dataset' + mid_fltr_tss_nrch_plot_png: type: File? 
outputSource: sc_multiome_filter/mid_fltr_tss_nrch_plot_png @@ -1015,6 +1159,42 @@ outputs: tab: 'Filtered QC' Caption: 'UMI per cell correlation for RNA vs ATAC assays' + fltr_rnadbl_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_rnadbl_plot_png + label: "Percentage of RNA doublets per dataset (filtered)" + doc: | + Percentage of RNA doublets per dataset (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Percentage of RNA doublets per dataset' + + fltr_atacdbl_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_atacdbl_plot_png + label: "Percentage of ATAC doublets per dataset (filtered)" + doc: | + Percentage of ATAC doublets per dataset (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Percentage of ATAC doublets per dataset' + + fltr_vrlpdbl_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_vrlpdbl_plot_png + label: "Doublets overlap for RNA and ATAC assays per dataset (filtered)" + doc: | + Doublets overlap for RNA and ATAC assays per dataset (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Doublets overlap for RNA and ATAC assays per dataset' + fltr_tss_atac_umi_corr_plot_png: type: File? 
outputSource: sc_multiome_filter/fltr_tss_atac_umi_corr_plot_png @@ -1217,6 +1397,7 @@ steps: aggregation_metadata: aggregation_metadata atac_fragments_file: atac_fragments_file annotation_gtf_file: annotation_gtf_file + chrom_length_file: chrom_length_file grouping_data: grouping_data blacklist_regions_file: blacklist_regions_file barcodes_data: barcodes_data @@ -1254,6 +1435,21 @@ steps: source: maximum_blacklist_fraction valueFrom: $(split_numbers(self)) call_by: call_by + remove_doublets: + source: remove_doublets + valueFrom: $(self=="none"?null:self) + rna_doublet_rate: + source: rna_doublet_rate + valueFrom: $(self===""?null:self) # safety measure; strict check keeps an explicit 0 + rna_doublet_rate_sd: + source: rna_doublet_rate_sd + valueFrom: $(self===""?null:self) # safety measure; strict check keeps an explicit 0 + atac_doublet_rate: + source: atac_doublet_rate + valueFrom: $(self===""?null:self) # safety measure; strict check keeps an explicit 0 + atac_doublet_rate_sd: + source: atac_doublet_rate_sd + valueFrom: $(self===""?null:self) # safety measure; strict check keeps an explicit 0 verbose: default: true export_ucsc_cb: @@ -1283,6 +1479,9 @@ steps: - raw_rna_atac_umi_corr_plot_png - raw_tss_atac_umi_corr_plot_png - raw_qc_mtrcs_dnst_plot_png + - raw_rnadbl_plot_png + - raw_atacdbl_plot_png + - raw_vrlpdbl_plot_png - raw_tss_nrch_plot_png - raw_frgm_hist_png - raw_rna_umi_dnst_spl_cnd_plot_png @@ -1306,6 +1505,9 @@ steps: - mid_fltr_rna_atac_umi_corr_plot_png - mid_fltr_tss_atac_umi_corr_plot_png - mid_fltr_qc_mtrcs_dnst_plot_png + - mid_fltr_rnadbl_plot_png + - mid_fltr_atacdbl_plot_png + - mid_fltr_vrlpdbl_plot_png - mid_fltr_tss_nrch_plot_png - mid_fltr_frgm_hist_png - mid_fltr_rna_umi_dnst_spl_cnd_plot_png @@ -1327,6 +1529,9 @@ steps: - fltr_peak_dnst_plot_png - fltr_blck_dnst_plot_png - fltr_rna_atac_umi_corr_plot_png + - fltr_rnadbl_plot_png + - fltr_atacdbl_plot_png + - fltr_vrlpdbl_plot_png - fltr_tss_atac_umi_corr_plot_png - fltr_qc_mtrcs_dnst_plot_png - fltr_tss_nrch_plot_png @@ -1400,6 +1605,6 @@ s:creator: doc: | Single-cell Multiome ATAC and RNA-Seq Filtering Analysis - + 
Filters single-cell multiome ATAC and RNA-Seq datasets based on the common QC metrics. \ No newline at end of file diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index f8f9a10a..34867ada 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -478,6 +478,13 @@ outputs: doc: | Processed Seurat data in RDS format + seurat_data_scope: + type: File? + outputSource: sc_rna_cluster/seurat_data_scope + label: "Processed Seurat data in SCope compatible loom format" + doc: | + Processed Seurat data in SCope compatible loom format + sc_rna_cluster_stdout_log: type: File outputSource: sc_rna_cluster/stdout_log @@ -520,6 +527,8 @@ steps: default: true export_ucsc_cb: default: true + export_scope_data: + default: true color_theme: color_theme parallel_memory_limit: source: parallel_memory_limit @@ -552,6 +561,7 @@ steps: - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds + - seurat_data_scope - stdout_log - stderr_log @@ -610,5 +620,5 @@ s:creator: doc: | Single-cell RNA-Seq Cluster Analysis - + =============================================================== Clusters single-cell RNA-Seq datasets, identifies gene markers. \ No newline at end of file diff --git a/workflows/sc-rna-da-cells.cwl b/workflows/sc-rna-da-cells.cwl index a4a8c20a..753bef07 100644 --- a/workflows/sc-rna-da-cells.cwl +++ b/workflows/sc-rna-da-cells.cwl @@ -469,4 +469,5 @@ s:creator: doc: | Single-cell Differential Abundance Analysis - Detects cell subpopulations with differential abundance between datasets split by biological condition. \ No newline at end of file + Detects cell subpopulations with differential abundance + between datasets split by biological condition. 
\ No newline at end of file diff --git a/workflows/sc-rna-de-pseudobulk.cwl b/workflows/sc-rna-de-pseudobulk.cwl index 83454378..0ba9cc2c 100644 --- a/workflows/sc-rna-de-pseudobulk.cwl +++ b/workflows/sc-rna-de-pseudobulk.cwl @@ -35,161 +35,171 @@ inputs: query_data_rds: type: File - label: "Experiment run through any of the Single-cell Cluster or Manual Cell Type Assignment Analysis" + label: "Single-cell Cluster or Manual Cell Type Assignment Analysis" doc: | - Path to the RDS file to load Seurat object from. This file should include genes - expression information stored in the RNA assay. Additionally, 'rnaumap', and/or - 'atacumap', and/or 'wnnumap' dimensionality reductions should be present. + Single-cell analysis run through the + clustering or cell type assignment + pipelines. 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" 'sd:localLabel': true - splitby: - type: string - label: "Column from the Seurat object metadata to split datasets into two groups" - doc: | - Column from the Seurat object metadata to split datasets into two groups - to run --second vs --first pseudobulk DE analysis, i.e., calculate log2FC. - May be one of the columns from the extra metadata added with --metadata - parameter. Provided value should group the datasets, not cells, therefore - do not use a column with clustering results. - - first_cond: - type: string - label: "Value from the Seurat object metadata column to define the first group of datasets" - doc: | - Value from the Seurat object metadata column set with --splitby to define the - first group of datasets for pseudobulk DE analysis. - - second_cond: - type: string - label: "Value from the Seurat object metadata column to define the second group of datasets" - doc: | - Value from the Seurat object metadata column set with --splitby to define the - second group of datasets for pseudobulk DE analysis. - - batchby: - type: string? 
- default: null - label: "Column from the Seurat object metadata to group datasets into batches" + datasets_metadata: + type: File? + label: "TSV/CSV file to assign categories per sample" + doc: | + If selected single-cell analysis was run + with the data aggregated from multiple + samples, you can optionally provide tab- + delimited or comma-separated file to + assign additional categories per sample. + First column should be named 'library_id' + and include all sample names from the + selected single-cell analysis regardless + whether filtering by barcodes was applied + or not. All other columns may have + arbitrary names. + + barcodes_data: + type: File? + label: "TSV/CSV file to filter cells by barcodes" doc: | - Column from the Seurat object metadata to group datasets into batches. It will be used - as a factor variable to model batch effect when running pseudobulk DE analysis (makes - design formula look like ~splitby+batchby). May be one of the columns from the extra - metadata added with --metadata parameter. Provided value should batch the datasets, not - cells, therefore do not use a column with clustering results. Default: do not model - batch effect. + Loaded single-cell data can be optionally + prefiltered by selected cell barcodes. + Provided tab-delimited or comma-separated + file should have the first column named + 'barcode'. If this file includes any other + columns, they will be used to assign + additional categories per cell. groupby: type: string? default: null - label: "Column from the Seurat object metadata to group cells for optional subsetting" + label: "Category to group cells for optional subsetting" doc: | - Column from the Seurat object metadata to group cells for optional subsetting - when combined with --subset parameter. May be one of the columns from the extra - metadata added with --metadata parameter. Ignored if --subset is not set. 
Provided - value defines the groups of cells, therefore any metadata column, including the - clustering results, may be used. Default: do not subset, run pseudobulk DE analysis - for all cells jointly + Before running differential expression + analysis input data can be optionally + prefiltered to include only certain + values from the specific category. + Here we define the name of that + category. subset: type: string? default: null - label: "Value(s) to subset cells before running analysis" + label: "List of values to subset cells from the selected category" doc: | - Value(s) from the column set with --groupby parameter to subset cells - before running pseudobulk DE analysis. If multiple values are provided - run analysis jointly for selected groups of cells. Ignored if --groupby - is not set. Default: do not subset, run pseudobulk DE analysis for all - cells jointly + If the category to group cells for + optional subsetting was provided, + here we define which values should + be included into analysis. - datasets_metadata: - type: File? - label: "Path to the TSV/CSV file to optionally extend Seurat object metadata" + splitby: + type: string + label: "Category to split cell into two groups" doc: | - Path to the TSV/CSV file to optionally extend Seurat object metadata with - categorical values using samples identities. First column - 'library_id' - should correspond to all unique values from the 'new.ident' column of the - loaded Seurat object. If any of the provided in this file columns are already - present in the Seurat object metadata, they will be overwritten. Default: no - extra metadata is added + All remaining after optional prefiltering + steps cells will be split into two groups + for gene expression comparison. - lrt: - type: boolean? 
- default: false - label: "Use LRT instead of the pair-wise Wald test" + first_cond: + type: string + label: "Value from the selected category to define the first group of cells" doc: | - Use LRT instead of the pair-wise Wald test. If --batchby is not provided - use ~1 as a reduced formula, otherwise ~batchby. Default: use Wald test - 'sd:layout': - advanced: true + Cells for which the selected category + includes provided value will be used + as the first group for differential + expression comparison. Direction of + comparison is second vs first groups. + + second_cond: + type: string + label: "Value from the selected category to define the second group of cells" + doc: | + Cells for which the selected category + includes provided value will be used + as the second group for differential + expression comparison. Direction of + comparison is second vs first groups. + + analysis_method: + type: + - "null" + - type: enum + symbols: + - "wilcoxon (by cells, no batches)" # (wilcox) Wilcoxon Rank Sum test + - "likelihood-ratio (by cells, no batches)" # (bimod) Likelihood-ratio test + - "t-test (by cells, no batches)" # (t) Student's t-test + - "negative-binomial (by cells, models batches)" # (negbinom) Negative Binomial Generalized Linear Model (supports --batchby) + - "poisson (by cells, models batches)" # (poisson) Poisson Generalized Linear Model (supports --batchby) + - "logistic-regression (by cells, models batches)" # (LR) Logistic Regression (supports --batchby) + - "mast (by cells, models batches)" # (MAST) MAST package (supports --batchby) + - "deseq (pseudo bulk, models batches)" # DESeq2 Wald test on pseudobulk aggregated gene expression + - "deseq-lrt (pseudo bulk, models batches)" # DESeq2 LRT test on pseudobulk aggregated gene expression + default: "wilcoxon (by cells, no batches)" # must match one of the enum symbols above exactly + label: "Test type to use in differential expression analysis" + doc: | + Test type to use in the differential + expression analysis.
If set to deseq + or deseq-lrt, gene expression will be + aggregated to the pseudobulk form per + sample. Otherwise, analysis will be + run on the cells level. If deseq is + selected, the pair-wise Wald test will + be used. For deseq-lrt, the Likelihood + Ratio Test will be applied between + design and reduced formulas. The + reduced formula will look like ~1 if + grouping by batches is omitted or will + be set to the category defined as + batches. + + batchby: + type: string? + default: null + label: "Category to model batch effect" + doc: | + If selected test type supports batch + effect modeling, the provided category + will be used to group cells into + batches. For deseq and deseq-lrt tests + batch modeling will result in adding it + into the design formula. For negative- + binomial, poisson, logistic-regression, + or mast tests grouping by batches will + be used as a latent variable in the + FindMarkers function. maximum_padj: type: float? default: 0.05 - label: "Maximum significance level used in the exploratory visualization part of the analysis" + label: "Maximum adjusted P-value for genes displayed on the heatmap" doc: | - In the exploratory visualization part of the analysis output only features - with adjusted P-value not bigger than this value. Default: 0.05 - 'sd:layout': - advanced: true + When generating gene expression heatmap + per cell output only differentially + expressed genes with the adjusted P-value + not bigger than this value. genes_of_interest: type: string? default: null - label: "Genes of interest to label on the generated plots" + label: "Genes of interest to be shown on the plots" doc: | - Genes of interest to label on the generated plots. Default: top 10 genes - with the highest and the lowest log2FC expression values. + Genes of interest to be shown on the + volcano, violin, and UMAP plots. 'sd:layout': advanced: true exclude_pattern: type: string?
default: null - label: "Regex pattern to identify and exclude non-coding RNA genes from the analysis" - doc: | - Regex pattern to identify and exclude non-coding RNA genes from the pseudobulk - DE analysis (not case-sensitive). If any of such genes were provided in the --genes - parameter, they will be excluded from there as well. - 'sd:layout': - advanced: true - - normalization_method: - type: - - "null" - - type: enum - symbols: - - "vst" - - "rlog" - default: "rlog" - label: "Read counts normalization for the exploratory visualization part of the analysis" - doc: | - Read counts normalization for the exploratory visualization part of the analysis. - Use 'vst' for medium-to-large datasets (n > 30) and 'rlog' for small datasets - (n < 30), when there is a wide range of sequencing depth across samples. - Default: rlog - 'sd:layout': - advanced: true - - remove: - type: boolean? - default: false - label: "Remove batch effect when generating normalized read counts" - doc: | - Remove batch effect when generating normalized read counts for the exploratory - visualization part of the analysis. Ignored if --batchby is not provided. - Default: do not remove batch effect from normalized read counts. - 'sd:layout': - advanced: true - - center_row: - type: boolean? - default: false - label: "Apply mean centering for feature expression prior to running clustering by row" - doc: | - Apply mean centering for gene expression prior to running - clustering by row. Ignored if --cluster is set to column or - not provided. Default: do not centered + label: "Regex pattern to identify and exclude specific genes from the analysis" + doc: | + Regex pattern to identify and exclude + specific genes from the differential + expression analysis (not case-sensitive). + If any of such genes were selected as + genes of interest to be shown on the plots, + they will be excluded from there as well. 
'sd:layout': advanced: true @@ -202,12 +212,16 @@ inputs: - "column" - "both" - "none" - default: "none" - label: "Hopach clustering method to be run on normalized read counts" - doc: | - Hopach clustering method to be run on normalized read counts for the - exploratory visualization part of the analysis. Default: do not run - clustering + default: "row" + label: "Clustering method for gene expression data" + doc: | + Clustering method to be run on + the normalized read counts data. + "column" and "both" options are + supported only when using deseq + or deseq-lrt tests for which gene + expression data is aggregated to the + pseudobulk form. 'sd:layout': advanced: true @@ -223,10 +237,11 @@ inputs: - "cor" - "abscor" default: "cosangle" - label: "Distance metric for HOPACH row clustering" + label: "Distance metric for row clustering" doc: | - Distance metric for HOPACH row clustering. Ignored if --cluster is set - to column or not provided. Default: cosangle + Distance metric for row clustering. + Ignored if clustering method is set + to "column" or "none". 'sd:layout': advanced: true @@ -242,10 +257,24 @@ inputs: - "cor" - "abscor" default: "euclid" - label: "Distance metric for HOPACH column clustering" + label: "Distance metric for column clustering" doc: | - Distance metric for HOPACH column clustering. Ignored if --cluster is set - to row or not provided. Default: euclid + Distance metric for column clustering. + Ignored if clustering method is set + to "row" or "none". + 'sd:layout': + advanced: true + + center_row: + type: boolean? + default: true + label: "Gene expression mean centering for clustering by row" + doc: | + Apply mean centering for gene + expression prior to running + clustering by row. Ignored if + clustering method is set to + "column" or "none".
'sd:layout': advanced: true @@ -263,11 +292,9 @@ inputs: - "classic" - "void" default: "classic" - label: "Color theme for all generated plots" + label: "Color theme" doc: | - Color theme for all generated plots. One of gray, bw, linedraw, light, - dark, minimal, classic, void. - Default: classic + Color theme for all generated plots. 'sd:layout': advanced: true @@ -278,11 +305,11 @@ inputs: symbols: - "32" default: "32" - label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" + label: "Maximum shared memory in GB" doc: | - Maximum memory in GB allowed to be shared between the workers - when using multiple --cpus. - Forced to 32 GB + Maximum memory in GB allowed to + be shared between the workers + when using multiple CPUs. 'sd:layout': advanced: true @@ -293,10 +320,10 @@ inputs: symbols: - "64" default: "64" - label: "Maximum vector memory in GB allowed to be used by R" + label: "Maximum vector memory in GB" doc: | - Maximum vector memory in GB allowed to be used by R. - Forced to 64 GB + Maximum vector memory in GB + allowed to be used by R. 'sd:layout': advanced: true @@ -307,10 +334,9 @@ inputs: symbols: - "1" default: "1" - label: "Number of cores/cpus to use" + label: "Number of cores/cpus" doc: | Number of cores/cpus to use - Forced to 1 'sd:layout': advanced: true @@ -320,52 +346,52 @@ outputs: umap_rd_rnaumap_plot_png: type: File? outputSource: de_pseudobulk/umap_rd_rnaumap_plot_png - label: "Cells RNA UMAP split by selected biological condition" + label: "Cells RNA UMAP split by selected criteria" doc: | - Cells UMAP split by selected biological condition, optionally - subsetted to the specific cluster or cell type (rnaumap dim. - reduction). + Cells UMAP split by selected criteria, + optionally subsetted to the specific + group (rnaumap dim. reduction). 
PNG format 'sd:visualPlugins': - image: tab: 'Overall' - Caption: 'Cells RNA UMAP split by selected biological condition' + Caption: 'Cells RNA UMAP split by selected criteria' umap_rd_atacumap_plot_png: type: File? outputSource: de_pseudobulk/umap_rd_atacumap_plot_png - label: "Cells ATAC UMAP split by selected biological condition" + label: "Cells ATAC UMAP split by selected criteria" doc: | - Cells UMAP split by selected biological condition, optionally - subsetted to the specific cluster or cell type (atacumap dim. - reduction). + Cells UMAP split by selected criteria, + optionally subsetted to the specific + group (atacumap dim. reduction). PNG format 'sd:visualPlugins': - image: tab: 'Overall' - Caption: 'Cells ATAC UMAP split by selected biological condition' + Caption: 'Cells ATAC UMAP split by selected criteria' umap_rd_wnnumap_plot_png: type: File? outputSource: de_pseudobulk/umap_rd_wnnumap_plot_png - label: "Cells WNN UMAP split by selected biological condition" + label: "Cells WNN UMAP split by selected criteria" doc: | - Cells UMAP split by selected biological condition, optionally - subsetted to the specific cluster or cell type (wnnumap dim. - reduction). + Cells UMAP split by selected criteria, + optionally subsetted to the specific + group (wnnumap dim. reduction). PNG format 'sd:visualPlugins': - image: tab: 'Overall' - Caption: 'Cells WNN UMAP split by selected biological condition' + Caption: 'Cells WNN UMAP split by selected criteria' mds_plot_html: type: File? outputSource: de_pseudobulk/mds_plot_html - label: "MDS plot of normalized counts" + label: "Interactive MDS Plot" doc: | - MDS plot of normalized counts. Optionally batch corrected - if --remove was set to True. + MDS plot of pseudobulk aggregated + normalized reads counts. All genes. 
HTML format 'sd:visualPlugins': - linkList: @@ -375,7 +401,7 @@ outputs: volcano_plot_html_file: type: File outputSource: make_volcano_plot/html_file - label: "Volcano Plot" + label: "Interactive Volcano Plot" doc: | HTML index file for Volcano Plot 'sd:visualPlugins': @@ -390,28 +416,10 @@ outputs: doc: | Directory html data for Volcano Plot - ma_plot_html_file: - type: File - outputSource: make_ma_plot/html_file - label: "MA-plot" - doc: | - HTML index file for MA-plot - 'sd:visualPlugins': - - linkList: - tab: 'Overview' - target: "_blank" - - ma_plot_html_data: - type: Directory - outputSource: make_ma_plot/html_data - label: "Directory html data for Volcano Plot" - doc: | - Directory html data for MA-plot - heatmap_html: type: File outputSource: morpheus_heatmap/heatmap_html - label: "Heatmap of normalized counts" + label: "Interactive Gene Expression Heatmap" doc: | Morpheus heatmap in HTML format 'sd:visualPlugins': @@ -422,60 +430,60 @@ outputs: pca_1_2_plot_png: type: File? outputSource: de_pseudobulk/pca_1_2_plot_png - label: "Normalized counts PCA (PC1 and PC2)" + label: "Normalized reads counts PCA (1, 2). All genes." doc: | - Normalized counts PCA (PC1 and PC2) subsetted to all DE genes regardless - of Padj, optionally batch corrected by the selected criteria. + Normalized reads counts PCA (1, 2). All genes. PNG format 'sd:visualPlugins': - image: tab: 'Overall' - Caption: 'Normalized counts PCA (PC1 and PC2)' + Caption: 'Normalized reads counts PCA (1, 2). All genes' pca_2_3_plot_png: type: File? outputSource: de_pseudobulk/pca_2_3_plot_png - label: "Normalized counts PCA (PC2 and PC3)" + label: "Normalized reads counts PCA (2, 3). All genes." doc: | - Normalized counts PCA (PC2 and PC3) subsetted to all DE genes regardless - of Padj, optionally batch corrected by the selected criteria. + Normalized reads counts PCA (2, 3). All genes. 
PNG format 'sd:visualPlugins': - image: tab: 'Overall' - Caption: 'Normalized counts PCA (PC2 and PC3)' + Caption: 'Normalized reads counts PCA (2, 3). All genes' dxpr_vlcn_plot_png: type: File? outputSource: de_pseudobulk/dxpr_vlcn_plot_png label: "Volcano plot of differentially expressed genes" doc: | - Volcano plot of differentially expressed genes. Highlighed genes are either - provided by user or top 10 genes with the highest log2FC values. The direction - of comparison is defined by --second vs --first groups of cells optionally - subsetted to the specific cluster or cell type and coerced to the pseudobulk - RNA-Seq samples. + Volcano plot of differentially expressed genes. + Highlighted genes are either provided by user or + top 10 genes with the highest log2FoldChange + values. The direction of comparison is defined + as --second vs --first. Cells are optionally + subsetted to the specific group and optionally + coerced to the pseudobulk form. PNG format 'sd:visualPlugins': - image: - tab: 'Overall' + tab: 'Gene expression' Caption: 'Volcano plot of differentially expressed genes' xpr_dnst_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: de_pseudobulk/xpr_dnst_plot_png - label: "Log normalized gene expression density per dataset" + label: "Log normalized gene expression density plots" doc: | - Log normalized gene expression density per dataset optionally subsetted to the - specific cluster or cell type. + Log normalized gene expression density plots for + either user provided or top 10 differentially + expressed genes with the highest log2FoldChange + values. The direction of comparison is defined + as --second vs --first.
PNG format 'sd:visualPlugins': - image: tab: 'Gene expression' - Caption: 'Log normalized gene expression density per dataset' + Caption: 'Log normalized gene expression density plots' xpr_per_cell_rd_rnaumap_plot_png: type: @@ -483,15 +491,16 @@ outputs: - type: array items: File outputSource: de_pseudobulk/xpr_per_cell_rd_rnaumap_plot_png - label: "Log normalized gene expression on cells RNA UMAP per dataset" + label: "Log normalized gene expression on cells RNA UMAP" doc: | - Log normalized gene expression on cells UMAP per dataset optionally subsetted - to the specific cluster or cell type (rnaumap dim. reduction). + Log normalized gene expression on cells UMAP + split by selected criteria, optionally subsetted + to the specific group (rnaumap dim. reduction). PNG format 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression on cells RNA UMAP per dataset' + tab: 'Gene expression RNA' + Caption: 'Log normalized gene expression on cells RNA UMAP' xpr_per_cell_rd_atacumap_plot_png: type: @@ -499,15 +508,16 @@ outputs: - type: array items: File outputSource: de_pseudobulk/xpr_per_cell_rd_atacumap_plot_png - label: "Log normalized gene expression on cells ATAC UMAP per dataset" + label: "Log normalized gene expression on cells ATAC UMAP" doc: | - Log normalized gene expression on cells UMAP per dataset optionally subsetted - to the specific cluster or cell type (atacumap dim. reduction). + Log normalized gene expression on cells UMAP + split by selected criteria, optionally subsetted + to the specific group (atacumap dim. reduction). 
PNG format 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression on cells ATAC UMAP per dataset' + tab: 'Gene expression ATAC' + Caption: 'Log normalized gene expression on cells ATAC UMAP' xpr_per_cell_rd_wnnumap_plot_png: type: @@ -515,58 +525,70 @@ outputs: - type: array items: File outputSource: de_pseudobulk/xpr_per_cell_rd_wnnumap_plot_png - label: "Log normalized gene expression on cells WNN UMAP per dataset" + label: "Log normalized gene expression on cells WNN UMAP" doc: | - Log normalized gene expression on cells UMAP per dataset optionally subsetted - to the specific cluster or cell type (wnnumap dim. reduction). + Log normalized gene expression on cells UMAP + split by selected criteria, optionally subsetted + to the specific group (wnnumap dim. reduction). PNG format 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression on cells WNN UMAP per dataset' + tab: 'Gene expression WNN' + Caption: 'Log normalized gene expression on cells WNN UMAP' xpr_htmp_plot_png: type: File? outputSource: de_pseudobulk/xpr_htmp_plot_png - label: "Log normalized gene expression heatmap per dataset" + label: "Filtered by adjusted P-value normalized gene expression heatmap" doc: | - Normalized gene expression heatmap optionally subsetted - to the specific cluster or cell type. + Filtered by adjusted P-value normalized gene + expression heatmap per cell optionally subsetted + to the specific group. PNG format 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Normalized gene expression heatmap' + tab: 'Gene expression' + Caption: 'Filtered by adjusted P-value normalized gene expression heatmap' diff_expr_genes: type: File outputSource: de_pseudobulk/diff_expr_genes - label: "Differentially expressed genes" + label: "Differentially expressed genes. Not filtered" doc: | - Differentially expressed genes. + Differentially expressed genes. Not filtered + by adjusted P-value. 
TSV format 'sd:visualPlugins': - syncfusiongrid: tab: 'Diff expressed genes' - Title: 'Differentially expressed genes' + Title: 'Differentially expressed genes. Not filtered' read_counts_file: - type: File - outputSource: de_pseudobulk/read_counts_gct - label: "GSEA compatible normalized counts" + type: File? + outputSource: de_pseudobulk/bulk_read_counts_gct + label: "GSEA compatible not filtered normalized reads counts" doc: | - GSEA compatible normalized counts, optionally, batch corrected. + GSEA compatible not filtered normalized reads + counts aggregated to pseudobulk form. GCT format phenotypes_file: - type: File - outputSource: de_pseudobulk/phenotypes_cls + type: File? + outputSource: de_pseudobulk/bulk_phenotypes_cls label: "GSEA compatible phenotypes file" doc: | - GSEA compatible phenotypes file defined based on --splitby, --first, - and --second parameters. + GSEA compatible phenotypes file defined based + on --splitby, --first, and --second parameters. CLS format + cell_read_counts_gct: + type: File + outputSource: de_pseudobulk/cell_read_counts_gct + label: "Filtered normalized reads counts per cell" + doc: | + Filtered normalized reads counts per cell. 
+ GCT format + de_pseudobulk_stdout_log: type: File outputSource: de_pseudobulk/stdout_log @@ -601,31 +623,38 @@ steps: in: query_data_rds: query_data_rds datasets_metadata: datasets_metadata - splitby: splitby - first_cond: first_cond - second_cond: second_cond - batchby: batchby - groupby: groupby + barcodes_data: barcodes_data + groupby: + source: groupby + valueFrom: $(self==""?null:self) # safety measure subset: source: subset valueFrom: $(split_features(self)) - lrt: lrt + splitby: splitby + first_cond: first_cond + second_cond: second_cond + analysis_method: + source: analysis_method + valueFrom: $(self.split(" ")[0]) + batchby: + source: batchby + valueFrom: $(self==""?null:self) # safety measure maximum_padj: maximum_padj genes_of_interest: source: genes_of_interest valueFrom: $(split_features(self)) - exclude_pattern: exclude_pattern - normalization_method: normalization_method - remove: remove + exclude_pattern: + source: exclude_pattern + valueFrom: $(self==""?null:self) # safety measure cluster_method: source: cluster_method valueFrom: $(self=="none"?null:self) row_distance: row_distance column_distance: column_distance center_row: center_row + color_theme: color_theme verbose: default: true - color_theme: color_theme parallel_memory_limit: source: parallel_memory_limit valueFrom: $(parseInt(self)) @@ -649,15 +678,16 @@ steps: - xpr_per_cell_rd_wnnumap_plot_png - xpr_htmp_plot_png - diff_expr_genes - - read_counts_gct - - phenotypes_cls + - bulk_read_counts_gct + - bulk_phenotypes_cls + - cell_read_counts_gct - stdout_log - stderr_log morpheus_heatmap: run: ../tools/morpheus-heatmap.cwl in: - read_counts_gct: de_pseudobulk/read_counts_gct + read_counts_gct: de_pseudobulk/cell_read_counts_gct out: - heatmap_html - stdout_log @@ -677,21 +707,6 @@ steps: - html_data - html_file - make_ma_plot: - run: ../tools/ma-plot.cwl - in: - diff_expr_file: de_pseudobulk/diff_expr_genes - x_axis_column: - default: "baseMean" - y_axis_column: - default: 
"log2FoldChange" - label_column: - default: "gene" - out: - - html_data - - html_file - - $namespaces: s: http://schema.org/ @@ -741,4 +756,5 @@ s:creator: doc: | Single-cell Pseudobulk Differential Expression Analysis Between Datasets - Identifies differentially expressed genes between groups of cells coerced to pseudobulk datasets. \ No newline at end of file + Identifies differentially expressed genes between groups of cells + coerced to pseudobulk datasets. \ No newline at end of file diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index 86be6008..b0a94dcf 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -19,6 +19,7 @@ requirements: - "cellranger-aggr.cwl" - "single-cell-preprocess-cellranger.cwl" - "cellranger-multi.cwl" + - "sc-format-transform.cwl" inputs: @@ -140,6 +141,40 @@ inputs: 'sd:layout': advanced: true + remove_doublets: + type: boolean? + default: false + label: "Remove cells that were identified as doublets" + doc: | + Remove cells that were identified as doublets. Cells with + RNA UMI < 200 will not be evaluated. Default: do not remove + doublets + 'sd:layout': + advanced: true + + rna_doublet_rate: + type: float? + default: null + label: "Expected RNA doublet rate" + doc: | + Expected RNA doublet rate. Default: 1 percent per + thousand cells captured with 10x genomics + 'sd:layout': + advanced: true + + rna_doublet_rate_sd: + type: float? + default: null + label: "Uncertainty range in the RNA doublet rate" + doc: | + Uncertainty range in the RNA doublet rate, interpreted as + a +/- around the value provided in --rnadbr. Set to 0 to + disable. Set to 1 to make the threshold depend entirely + on the misclassification rate. Default: 40 percents of the + value provided in --rnadbr + 'sd:layout': + advanced: true + color_theme: type: - "null" @@ -316,6 +351,18 @@ outputs: tab: 'Not filtered QC' Caption: 'QC metrics per cell density' + raw_rnadbl_plot_png: + type: File? 
+ outputSource: sc_rna_filter/raw_rnadbl_plot_png + label: "Percentage of RNA doublets per dataset (not filtered)" + doc: | + Percentage of RNA doublets per dataset (not filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Not filtered QC' + Caption: 'Percentage of RNA doublets per dataset' + raw_umi_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/raw_umi_dnst_spl_cnd_plot_png @@ -473,6 +520,18 @@ outputs: tab: 'Filtered QC' Caption: 'QC metrics per cell density' + fltr_rnadbl_plot_png: + type: File? + outputSource: sc_rna_filter/fltr_rnadbl_plot_png + label: "Percentage of RNA doublets per dataset (filtered)" + doc: | + Percentage of RNA doublets per dataset (filtered). + PNG format + 'sd:visualPlugins': + - image: + tab: 'Filtered QC' + Caption: 'Percentage of RNA doublets per dataset' + fltr_umi_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/fltr_umi_dnst_spl_cnd_plot_png @@ -605,6 +664,13 @@ steps: valueFrom: $(split_numbers(self)) mito_pattern: mito_pattern maximum_mito_perc: maximum_mito_perc + remove_doublets: remove_doublets + rna_doublet_rate: + source: rna_doublet_rate + valueFrom: $(self==""?null:self) # safety measure + rna_doublet_rate_sd: + source: rna_doublet_rate_sd + valueFrom: $(self==""?null:self) # safety measure verbose: default: true export_ucsc_cb: @@ -629,6 +695,7 @@ steps: - raw_mito_dnst_plot_png - raw_nvlt_dnst_plot_png - raw_qc_mtrcs_dnst_plot_png + - raw_rnadbl_plot_png - raw_umi_dnst_spl_cnd_plot_png - raw_gene_dnst_spl_cnd_plot_png - raw_mito_dnst_spl_cnd_plot_png @@ -642,6 +709,7 @@ steps: - fltr_mito_dnst_plot_png - fltr_nvlt_dnst_plot_png - fltr_qc_mtrcs_dnst_plot_png + - fltr_rnadbl_plot_png - fltr_umi_dnst_spl_cnd_plot_png - fltr_gene_dnst_spl_cnd_plot_png - fltr_mito_dnst_spl_cnd_plot_png @@ -708,5 +776,5 @@ s:creator: doc: | Single-cell RNA-Seq Filtering Analysis - + Filters single-cell RNA-Seq datasets based on the common QC metrics. 
\ No newline at end of file diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 33fc5adb..527740a0 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -313,11 +313,11 @@ inputs: - type: enum symbols: - "1" + - "2" default: "1" label: "Number of cores/cpus to use" doc: | Number of cores/cpus to use - Forced to 1 'sd:layout': advanced: true @@ -671,5 +671,4 @@ s:creator: doc: | Single-cell RNA-Seq Dimensionality Reduction Analysis - Integrates multiple single-cell RNA-Seq datasets, reduces - dimensionality using PCA. \ No newline at end of file + Integrates multiple single-cell RNA-Seq datasets, reduces dimensionality using PCA. \ No newline at end of file diff --git a/workflows/sc-triangulate.cwl b/workflows/sc-triangulate.cwl index f47c072a..21b93b73 100644 --- a/workflows/sc-triangulate.cwl +++ b/workflows/sc-triangulate.cwl @@ -400,5 +400,4 @@ s:creator: doc: | Single-cell Label Integration Analysis - Harmonizes conflicting annotations in single-cell - genomics studies. \ No newline at end of file + Harmonizes conflicting annotations in single-cell genomics studies. \ No newline at end of file diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index 50824cd1..db5ffb17 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -621,6 +621,15 @@ outputs: doc: | Processed Seurat data in RDS format + seurat_data_scope: + type: File? + outputSource: sc_wnn_cluster/seurat_data_scope + label: "Processed Seurat data in SCope compatible loom format" + doc: | + Processed Seurat data in SCope compatible loom format. 
+ Only not normalized raw counts from the RNA assay will + be saved + sc_wnn_cluster_stdout_log: type: File outputSource: sc_wnn_cluster/stdout_log @@ -674,6 +683,8 @@ steps: default: true export_ucsc_cb: default: true + export_scope_data: + default: true color_theme: color_theme parallel_memory_limit: source: parallel_memory_limit @@ -707,6 +718,7 @@ steps: - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds + - seurat_data_scope - stdout_log - stderr_log @@ -766,5 +778,5 @@ s:creator: doc: | Single-cell WNN Cluster Analysis - Clusters multiome ATAC and RNA-Seq datasets, identifies gene - markers and differentially accessible peaks. \ No newline at end of file + Clusters multiome ATAC and RNA-Seq datasets, identifies gene markers + and differentially accessible peaks. \ No newline at end of file From df5c5e414993114f621a52837bd274ff4545d6a4 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 23 May 2023 14:45:10 -0400 Subject: [PATCH 032/162] Remove unused tool --- tools/sc-split-atac.cwl | 147 ---------------------------------------- 1 file changed, 147 deletions(-) delete mode 100644 tools/sc-split-atac.cwl diff --git a/tools/sc-split-atac.cwl b/tools/sc-split-atac.cwl deleted file mode 100644 index 73145a8c..00000000 --- a/tools/sc-split-atac.cwl +++ /dev/null @@ -1,147 +0,0 @@ -cwlVersion: v1.0 -class: CommandLineTool - - -requirements: -- class: InlineJavascriptRequirement - - -hints: -- class: DockerRequirement - dockerPull: biowardrobe2/sc-split-atac:v0.0.1 - - -inputs: - - atac_fragments_file: - type: File - inputBinding: - prefix: "--fragments" - doc: | - Path to GZIP compressed TSV file with ATAC fragments (from Cell Ranger ARC) - - clusters_metadata: - type: File - inputBinding: - prefix: "--clusters" - doc: | - Path to headerless TSV file with barcodes (first column) and - clusters (second column) - - log_level: - type: - - "null" - - type: enum - symbols: - - "fatal" - - "error" - - "warning" - - "info" - - "debug" - inputBinding: - 
prefix: "--loglevel" - doc: | - Logging level. - Default: info - - output_prefix: - type: string? - inputBinding: - prefix: "--output" - doc: | - Output file prefix. - Default: ./split - - threads: - type: int? - inputBinding: - prefix: "--cpus" - doc: | - Number of processes to run in parallel. - Default: 1 - - -outputs: - - atac_fragments_per_cluster_file: - type: File[] - outputBinding: - glob: "*.bed" - - stdout_log: - type: stdout - - stderr_log: - type: stderr - - -baseCommand: ["sc_split_atac.py"] - -stdout: sc_split_atac_stdout.log -stderr: sc_split_atac_stderr.log - - -$namespaces: - s: http://schema.org/ - -$schemas: -- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - - -label: "Single-cell Split ATAC Fragments" -s:name: "Single-cell Split ATAC Fragments" -s:alternateName: "Splits scATAC fragments produced by Cell Ranger ARC Count/Aggregate pipelines" - -s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-split-atac.cwl -s:codeRepository: https://github.com/Barski-lab/workflows -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Cincinnati Children's Hospital Medical Center" - s:location: - - class: s:PostalAddress - s:addressCountry: "USA" - s:addressLocality: "Cincinnati" - s:addressRegion: "OH" - s:postalCode: "45229" - s:streetAddress: "3333 Burnet Ave" - s:telephone: "+1(513)636-4200" - s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" - s:department: - - class: s:Organization - s:legalName: "Allergy and Immunology" - s:department: - - class: s:Organization - s:legalName: "Barski Research Lab" - s:member: - - class: s:Person - s:name: Michael Kotliar - s:email: mailto:misha.kotliar@gmail.com - s:sameAs: - - id: 
http://orcid.org/0000-0002-6486-3898 - - -doc: | - Single-cell Split ATAC Fragments - ============================================================================= - Splits scATAC fragments produced by Cell Ranger ARC Count/Aggregate pipelines - - -s:about: | - usage: sc_split_atac.py [-h] --fragments FRAGMENTS --clusters CLUSTERS [--cpus CPUS] [--loglevel {fatal,error,warning,info,debug}] [--output OUTPUT] - - optional arguments: - -h, --help show this help message and exit - --fragments FRAGMENTS - Path to GZIP compressed TSV file with ATAC fragments (from Cell Ranger ARC) - --clusters CLUSTERS Path to headerless TSV file with barcodes (first column) and clusters (second column) - --cpus CPUS Number of processes to run in parallel - --loglevel {fatal,error,warning,info,debug} - Logging level. Default: info - --output OUTPUT Output file prefix From e847426b6e26eb70eaec808674f93b8cfca7e974 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 24 May 2023 22:38:42 -0400 Subject: [PATCH 033/162] Add gene name to the peak label on Volcano Plot --- workflows/sc-atac-dbinding.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/sc-atac-dbinding.cwl b/workflows/sc-atac-dbinding.cwl index 54095eee..d6aff015 100644 --- a/workflows/sc-atac-dbinding.cwl +++ b/workflows/sc-atac-dbinding.cwl @@ -708,7 +708,7 @@ steps: default: | HEADER=`head -n 1 $0`; echo -e "label\t${HEADER}" > diff_sts_labeled.tsv; - cat "$0" | grep -v "start" | awk -F "\t" '{print $1":"$2"-"$3"\t"$0}' >> diff_sts_labeled.tsv + cat "$0" | grep -v "start" | awk -F "\t" '{print $1":"$2"-"$3"-"$NF"\t"$0}' >> diff_sts_labeled.tsv out: - output_file From 9b8ad0e16689ad44284038efbf3724c94925ee11 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 26 May 2023 13:08:21 -0400 Subject: [PATCH 034/162] Use the latest docker image for FASTQ download Shows worning for missing SRR identifiers --- tools/fastq-dump.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tools/fastq-dump.cwl b/tools/fastq-dump.cwl index 351bca4e..9bded3bb 100644 --- a/tools/fastq-dump.cwl +++ b/tools/fastq-dump.cwl @@ -10,7 +10,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/fastqdwnld:v0.0.1 + dockerPull: biowardrobe2/fastqdwnld:v0.0.2 inputs: From f05602ee6e45d58f395e4e2f738aa388d4306622 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 31 May 2023 15:27:35 -0400 Subject: [PATCH 035/162] Select blacklist regions and cell cycle genes based on genome type --- tools/sc-atac-cluster.cwl | 2 +- tools/sc-atac-coverage.cwl | 2 +- tools/sc-atac-dbinding.cwl | 2 +- tools/sc-atac-reduce.cwl | 2 +- tools/sc-ctype-assign.cwl | 2 +- tools/sc-multiome-filter.cwl | 27 +++++++++++++++++++++++++-- tools/sc-rna-cluster.cwl | 2 +- tools/sc-rna-da-cells.cwl | 2 +- tools/sc-rna-de-pseudobulk.cwl | 2 +- tools/sc-rna-filter.cwl | 2 +- tools/sc-rna-reduce.cwl | 28 +++++++++++++++++++++++++--- tools/sc-triangulate.cwl | 2 +- tools/sc-wnn-cluster.cwl | 2 +- workflows/sc-multiome-filter.cwl | 23 +++++++++++++++++++---- workflows/sc-rna-reduce.cwl | 24 ++++++++++++++---------- 15 files changed, 94 insertions(+), 30 deletions(-) diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index 6684d650..13e2aaba 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.21 + dockerPull: biowardrobe2/sc-tools:v0.0.22 inputs: diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index 4bb7e991..87bcf337 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.21 + dockerPull: biowardrobe2/sc-tools:v0.0.22 inputs: diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index 7f6c49db..5599ddf8 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl 
@@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.21 + dockerPull: biowardrobe2/sc-tools:v0.0.22 inputs: diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index f803bbd1..bca8d6f3 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.21 + dockerPull: biowardrobe2/sc-tools:v0.0.22 inputs: diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index d59cfcb9..1df1ec99 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.21 + dockerPull: biowardrobe2/sc-tools:v0.0.22 inputs: diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index bdb9adb2..15672a5d 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.21 + dockerPull: biowardrobe2/sc-tools:v0.0.22 inputs: @@ -74,11 +74,34 @@ inputs: Default: each dataset is assigned to its own group. blacklist_regions_file: - type: File? + type: + - "null" + - File + - type: enum + symbols: + - "hg19" + - "hg38" + - "mm10" inputBinding: prefix: "--blacklist" + valueFrom: | + ${ + if (self.class && self.class == "File"){ + return self; + } else if (self == "hg19") { + return "/opt/sc_tools/hg19-blacklist.v2.bed"; + } else if (self == "hg38") { + return "/opt/sc_tools/hg38-blacklist.v2.bed"; + } else if (self == "mm10") { + return "/opt/sc_tools/mm10-blacklist.v2.bed"; + } else { + return null; + } + } doc: | Path to the optional BED file with the genomic blacklist regions. + If a string value provided, it should be one of the hg19, hg38, + or mm10 as we replace it with the file location from docker image barcodes_data: type: File? 
diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index b35a10f6..d75ee916 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.21 + dockerPull: biowardrobe2/sc-tools:v0.0.22 inputs: diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index 47e7f1de..3701ef9c 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.21 + dockerPull: biowardrobe2/sc-tools:v0.0.22 inputs: diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index 78efbe58..be87a1c6 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.21 + dockerPull: biowardrobe2/sc-tools:v0.0.22 inputs: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index 7d62dafc..8c05b295 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.21 + dockerPull: biowardrobe2/sc-tools:v0.0.22 inputs: diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index 33124f42..00c3d1d3 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.21 + dockerPull: biowardrobe2/sc-tools:v0.0.22 inputs: @@ -52,14 +52,36 @@ inputs: Default: all cells used, no extra metadata is added cell_cycle_data: - type: File? 
+ type: + - "null" + - File + - type: enum + symbols: + - "hg19" + - "hg38" + - "mm10" inputBinding: prefix: "--cellcycle" + valueFrom: | + ${ + if (self.class && self.class == "File"){ + return self; + } else if (self == "hg19") { + return "/opt/sc_tools/human_cc_genes.csv"; + } else if (self == "hg38") { + return "/opt/sc_tools/human_cc_genes.csv"; + } else if (self == "mm10") { + return "/opt/sc_tools/mouse_cc_genes.csv"; + } else { + return null; + } + } doc: | Path to the TSV/CSV file with the information for cell cycle score assignment. First column - 'phase', second column 'gene_id'. If loaded Seurat object already includes cell cycle scores in 'S.Score', 'G2M.Score', and 'CC.Difference' metatada - columns they will be overwritten. + columns they will be overwritten. If a string value provided, it should be one of + the hg19, hg38, or mm10 as we replace it with the file location from docker image. Default: skip cell cycle score assignment. normalization_method: diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index 61fb9723..304828a2 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.21 + dockerPull: biowardrobe2/sc-tools:v0.0.22 inputs: diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index 861bd510..01617d62 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.21 + dockerPull: biowardrobe2/sc-tools:v0.0.22 inputs: diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 8eee63cb..4a94ebd5 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -84,10 +84,22 @@ inputs: Default: each dataset is assigned to its own group. blacklist_regions_file: - type: File? 
- label: "Optional BED file with the genomic blacklist regions" - doc: | - Path to the optional BED file with the genomic blacklist regions. + type: + - "null" # although, we allow it to be optional, all our upstreams should have "genome" input + - type: enum + symbols: + - "hg19" + - "hg38" + - "mm10" + label: "Genome type for genomic blacklist regions selection" + doc: | + One of the hg19/hg38/mm10 values to automatically + select file with blacklisted genomic regions. If + not provided, filtering by the maximum fraction of + fragments in genomic blacklist regions won't be + applied. + 'sd:upstreamSource': "sc_arc_sample/genome_indices/genome_indices/genome" + 'sd:localLabel': true barcodes_data: type: File? @@ -375,6 +387,9 @@ inputs: - type: enum symbols: - "1" + - "2" + - "3" + - "4" default: "1" label: "Number of cores/cpus to use" doc: | diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 527740a0..35330d6d 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -42,6 +42,20 @@ inputs: 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" 'sd:localLabel': true + cell_cycle_data: + type: + - "null" + - type: enum + symbols: + - "hg19" + - "hg38" + - "mm10" + label: "Genome type for cell cycle genes selection" + doc: | + Genome type to use for cell cycle score + assignment. If not provided, cell cycle + scores won't be assigned. + datasets_metadata: type: File? label: "Path to the TSV/CSV file to optionally extend Seurat object metadata with categorical values" @@ -67,16 +81,6 @@ inputs: those are present. Default: all cells used, no extra metadata is added - cell_cycle_data: - type: File? - label: "Optional TSV/CSV file with cell cycle data. First column - 'phase', second column 'gene_id'" - doc: | - Path to the TSV/CSV file with the information for cell cycle score assignment. - First column - 'phase', second column 'gene_id'. 
If loaded Seurat object already - includes cell cycle scores in 'S.Score', 'G2M.Score', and 'CC.Difference' metatada - columns they will be overwritten. - Default: skip cell cycle score assignment. - dimensions: type: int? label: "Dimensionality to use in UMAP projection (from 1 to 50)" From 8fb416d1f9748ef8dc1278a22568006e2b4a745b Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 2 Jun 2023 17:27:29 -0400 Subject: [PATCH 036/162] Update DESeq pipeline to use the latest docker Should fix the problem with non-unique RefseqIds --- tools/deseq-multi-factor.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/deseq-multi-factor.cwl b/tools/deseq-multi-factor.cwl index 61bbfc89..d4022eb1 100644 --- a/tools/deseq-multi-factor.cwl +++ b/tools/deseq-multi-factor.cwl @@ -8,7 +8,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/deseq:v0.0.3 + dockerPull: biowardrobe2/deseq:v0.0.4 inputs: From aaff3d9c9a1a74daf6c59d184e08e66e779eab25 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 5 Jun 2023 12:23:52 -0400 Subject: [PATCH 037/162] Refactor Multiome Filtering pipeline --- workflows/sc-multiome-filter.cwl | 1641 ++++++++++++------------------ 1 file changed, 659 insertions(+), 982 deletions(-) diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 4a94ebd5..5b9a4956 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -14,7 +14,7 @@ requirements: }; -'sd:upstream': +"sd:upstream": sc_arc_sample: - "cellranger-arc-count.cwl" - "cellranger-arc-aggr.cwl" @@ -24,310 +24,360 @@ inputs: alias: type: string - label: "Experiment short name/alias" + label: "Analysis name" sd:preview: position: 1 filtered_feature_bc_matrix_folder: type: File - label: "Cell Ranger ARC Count/Aggregate Experiment" + label: "Cell Ranger ARC Sample" doc: | - Path to the compressed folder with feature-barcode matrix from Cell Ranger ARC Count/Aggregate - experiment in MEX 
format. The rows consist of all the genes and peaks concatenated - together and the columns are restricted to those barcodes that are identified as cells. - 'sd:upstreamSource': "sc_arc_sample/filtered_feature_bc_matrix_folder" - 'sd:localLabel': true - - aggregation_metadata: - type: File? - label: "Cell Ranger ARC Count/Aggregate Experiment" - doc: | - Path to the metadata TSV/CSV file to set the datasets identities. If '--mex' points to - the Cell Ranger ARC Aggregate outputs, the aggr.csv file can be used. If input is not - provided, the default dummy_metadata.csv will be used instead. - 'sd:upstreamSource': "sc_arc_sample/aggregation_metadata" + Any "Cell Ranger ARC Sample" that produces + compressed folder with feature-barcode + matrix in MEX format, ATAC fragments file + in TSV format, and optional aggregation + metadata file in TSV/CSV format. + "sd:upstreamSource": "sc_arc_sample/filtered_feature_bc_matrix_folder" + "sd:localLabel": true atac_fragments_file: type: File secondaryFiles: - .tbi - label: "Cell Ranger ARC Count/Aggregate Experiment" - doc: | - Count and barcode information for every ATAC fragment observed in the experiment in TSV - format. Tbi-index file is required. - 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" + "sd:upstreamSource": "sc_arc_sample/atac_fragments_file" + + aggregation_metadata: + type: File? + "sd:upstreamSource": "sc_arc_sample/aggregation_metadata" annotation_gtf_file: type: File - label: "Cell Ranger ARC Count/Aggregate Experiment" - doc: | - Path to the genome annotation file in GTF format. 
- 'sd:upstreamSource': "sc_arc_sample/genome_indices/genome_indices/annotation_gtf" - 'sd:localLabel': true + "sd:upstreamSource": "sc_arc_sample/genome_indices/genome_indices/annotation_gtf" chrom_length_file: type: File - label: "Cell Ranger ARC Count/Aggregate Experiment" - doc: | - Chromosome length file in TSV format - 'sd:upstreamSource': "sc_arc_sample/genome_indices/chrom_length_file" - 'sd:localLabel': true - - grouping_data: - type: File? - label: "Optional TSV/CSV file to define datasets grouping with 'library_id' and 'condition' columns. Rows order should correspond to the aggregation metadata." - doc: | - Path to the TSV/CSV file to define datasets grouping. - First column - 'library_id' with the values and order - that correspond to the 'library_id' column from the ' - --identity' file, second column 'condition'. - Default: each dataset is assigned to its own group. + "sd:upstreamSource": "sc_arc_sample/genome_indices/chrom_length_file" blacklist_regions_file: type: - - "null" # although, we allow it to be optional, all our upstreams should have "genome" input + - "null" - type: enum symbols: - "hg19" - "hg38" - "mm10" - label: "Genome type for genomic blacklist regions selection" - doc: | - One of the hg19/hg38/mm10 values to automatically - select file with blacklisted genomic regions. If - not provided, filtering by the maximum fraction of - fragments in genomic blacklist regions won't be - applied. - 'sd:upstreamSource': "sc_arc_sample/genome_indices/genome_indices/genome" - 'sd:localLabel': true + "sd:upstreamSource": "sc_arc_sample/genome_indices/genome_indices/genome" + + grouping_data: + type: File? + label: "Datasets grouping (optional)" + doc: | + If the selected "Cell Ranger ARC + Sample" includes multiple aggregated + datasets, each dataset can be assigned + to a separate group by providing a + TSV/CSV file with "library_id" and + "condition" columns. 
Obtain this file + from the "aggregation_metadata.csv" + output generated by "Cell Ranger ARC + Sample" and accessible on the "Files" + tab. Remove all columns except the + "library_id". Add the group names for + each dataset in a separate column + named "condition". barcodes_data: type: File? - label: "Optional TSV/CSV file to prefilter and extend metadata be barcodes. First column should be named as 'barcode'" + label: "Selected cell barcodes (optional)" doc: | - Path to the TSV/CSV file to optionally prefilter and - extend Seurat object metadata be selected barcodes. - First column should be named as 'barcode'. If file - includes any other columns they will be added to the - Seurat object metadata ovewriting the existing ones if - those are present. - Default: all cells used, no extra metadata is added + A TSV/CSV file to optionally prefilter + the single cell data by including only + the cells with the selected barcodes. + The provided file should include at + least one column named "barcode", with + one cell barcode per line. All other + columns, except for "barcode", will be + added to the single cell metadata loaded + from "Cell Ranger ARC Sample" and can be + utilized in the current or future steps + of analysis. + + remove_doublets: + type: + - type: enum + symbols: + - "Based on either RNA or ATAC" + - "Based on RNA" + - "Based on ATAC" + - "Based on both RNA and ATAC" + - "Do not remove" + default: "Do not remove" + label: "Doublets removal" + doc: | + Quality control filtering parameter + to remove cells identified as doublets. + Depending on the selected option, + doublets can be detected and removed + based on only gene expression or + chromatin accessibility data, or their + combination (union or intersection). + Default: do not remove + "sd:layout": + advanced: true + + rna_minimum_umi: + type: string? 
+ default: "500" + label: "Minimum number of RNA UMI counts per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the number of RNA UMI + counts smaller than the provided value. + If the selected "Cell Ranger ARC + Sample" includes multiple aggregated + datasets, each of them can be filtered + independently by providing comma or + space-separated list of filtering + thresholds. The order and number of + the specified values need to match + with the datasets order from the + "aggregation_metadata.csv" output + generated by "Cell Ranger ARC Sample" + and accessible on the "Files" tab. + Default: 500 + "sd:layout": + advanced: true minimum_genes: type: string? default: "250" - label: "Include cells where at least this many genes are detected" - doc: | - Include cells where at least this many genes are detected. If multiple values - provided, each of them will be applied to the correspondent dataset from the - '--mex' input based on the '--identity' file. - Default: 250 (applied to all datasets) - 'sd:layout': + label: "Minimum number of genes per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the number of expressed + genes smaller than the provided value. + If the selected "Cell Ranger ARC + Sample" includes multiple aggregated + datasets, each of them can be filtered + independently by providing comma or + space-separated list of filtering + thresholds. The order and number of + the specified values need to match + with the datasets order from the + "aggregation_metadata.csv" output + generated by "Cell Ranger ARC Sample" + and accessible on the "Files" tab. + Default: 250 + "sd:layout": advanced: true maximum_genes: type: string? default: "5000" - label: "Include cells with the number of genes not bigger than this value" - doc: | - Include cells with the number of genes not bigger than this value. 
If multiple - values provided, each of them will be applied to the correspondent dataset from - the '--mex' input based on the '--identity' file. - Default: 5000 (applied to all datasets) - 'sd:layout': - advanced: true - - rna_minimum_umi: - type: string? - default: "500" - label: "Include cells where at least this many UMI (RNA transcripts) are detected" - doc: | - Include cells where at least this many UMI (RNA transcripts) are detected. - If multiple values provided, each of them will be applied to the correspondent - dataset from the '--mex' input based on the '--identity' file. - Default: 500 (applied to all datasets) - 'sd:layout': + label: "Maximum number of genes per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the number of expressed + genes bigger than the provided value. + If the selected "Cell Ranger ARC + Sample" includes multiple aggregated + datasets, each of them can be filtered + independently by providing comma or + space-separated list of filtering + thresholds. The order and number of + the specified values need to match + with the datasets order from the + "aggregation_metadata.csv" output + generated by "Cell Ranger ARC Sample" + and accessible on the "Files" tab. + Default: 5000 + "sd:layout": advanced: true mito_pattern: type: string? default: "^mt-|^MT-" - label: "Regex pattern to identify mitochondrial genes" + label: "Mitochondrial genes pattern" doc: | - Regex pattern to identify mitochondrial genes. - Default: '^mt-|^MT-' - 'sd:layout': + Regex pattern to identify mitochondrial + genes based on their names. + Default: "^mt-|^MT-" + "sd:layout": advanced: true maximum_mito_perc: type: float? default: 5 - label: "Include cells with the percentage of transcripts mapped to mitochondrial genes not bigger than this value" - doc: | - Include cells with the percentage of transcripts mapped to mitochondrial - genes not bigger than this value. 
- Default: 5 (applied to all datasets) - 'sd:layout': + label: "Maximum mitochondrial percentage per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the percentage of + transcripts mapped to mitochondrial + genes exceeding the provided value. + Default: 5 + "sd:layout": advanced: true minimum_novelty_score: type: string? default: "0.8" - label: "Include cells with the novelty score not lower than this value, calculated as log10(genes)/log10(UMI) for RNA assay" - doc: | - Include cells with the novelty score not lower than this value, calculated - as log10(genes)/log10(UMI) for RNA assay. If multiple values provided, each of them will - be applied to the correspondent dataset from the '--mex' input based on the - '--identity' file. - Default: 0.8 (applied to all datasets) - 'sd:layout': + label: "Minimum novelty score per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the novelty scores + smaller than the provided value. + This QC metrics indicates the overall + transcriptomic dissimilarity of the + cells and is calculated as the ratio + of log10(Genes) to log10(RNA UMI). + If the selected "Cell Ranger ARC + Sample" includes multiple aggregated + datasets, each of them can be filtered + independently by providing comma or + space-separated list of filtering + thresholds. The order and number of + the specified values need to match + with the datasets order from the + "aggregation_metadata.csv" output + generated by "Cell Ranger ARC Sample" + and accessible on the "Files" tab. + Default: 0.8 + "sd:layout": advanced: true atac_minimum_umi: type: string? default: "1000" - label: "Include cells where at least this many UMI (ATAC transcripts) are detected" - doc: | - Include cells where at least this many UMI (ATAC transcripts) are detected. 
- If multiple values provided, each of them will be applied to the correspondent - dataset from the '--mex' input based on the '--identity' file. - Default: 1000 (applied to all datasets) - 'sd:layout': - advanced: true - - maximum_nucl_signal: - type: string? - default: "4" - label: "Include cells with the nucleosome signal not bigger than this value" - doc: | - Include cells with the nucleosome signal not bigger than this value. - Nucleosome signal quantifies the approximate ratio of mononucleosomal - to nucleosome-free fragments. If multiple values provided, each of - them will be applied to the correspondent dataset from the '--mex' input - based on the '--identity' file. - Default: 4 (applied to all datasets) - 'sd:layout': + label: "Minimum number of ATAC UMI counts per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the number of ATAC UMI + counts smaller than the provided value. + If the selected "Cell Ranger ARC + Sample" includes multiple aggregated + datasets, each of them can be filtered + independently by providing comma or + space-separated list of filtering + thresholds. The order and number of + the specified values need to match + with the datasets order from the + "aggregation_metadata.csv" output + generated by "Cell Ranger ARC Sample" + and accessible on the "Files" tab. + Default: 1000 + "sd:layout": advanced: true minimum_tss_enrich: type: string? default: "2" - label: "Include cells with the TSS enrichment score not lower than this value" - doc: | - Include cells with the TSS enrichment score not lower than this value. - Score is calculated based on the ratio of fragments centered at the TSS - to fragments in TSS-flanking regions. If multiple values provided, each - of them will be applied to the correspondent dataset from the '--mex' input - based on the '--identity' file. 
- Default: 2 (applied to all datasets) - 'sd:layout': + label: "Minimum TSS enrichment score per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the TSS enrichment score + smaller than the provided value. + This QC metrics is calculated based + on the ratio of ATAC fragments + centered at the genes TSS to ATAC + fragments in the TSS-flanking regions. + If the selected "Cell Ranger ARC + Sample" includes multiple aggregated + datasets, each of them can be filtered + independently by providing comma or + space-separated list of filtering + thresholds. The order and number of + the specified values need to match + with the datasets order from the + "aggregation_metadata.csv" output + generated by "Cell Ranger ARC Sample" + and accessible on the "Files" tab. + Default: 2 + "sd:layout": advanced: true minimum_frip: type: string? default: "0.15" - label: "Include cells with the FRiP not lower than this value" - doc: | - Include cells with the FRiP not lower than this value. If multiple values - provided, each of them will be applied to the correspondent dataset from the - '--mex' input based on the '--identity' file. FRiP is calculated for fragments. - Default: 0.15 (applied to all datasets) - 'sd:layout': + label: "Minimum FRiP per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the FRiP (Fraction of + Reads in Peaks) smaller than the + provided value. + If the selected "Cell Ranger ARC + Sample" includes multiple aggregated + datasets, each of them can be filtered + independently by providing comma or + space-separated list of filtering + thresholds. The order and number of + the specified values need to match + with the datasets order from the + "aggregation_metadata.csv" output + generated by "Cell Ranger ARC Sample" + and accessible on the "Files" tab. 
+ Default: 0.15 + "sd:layout": advanced: true - maximum_blacklist_fraction: + maximum_nucl_signal: type: string? - default: "0.05" - label: "Include cells with the fraction of fragments in genomic blacklist regions not bigger than this value" - doc: | - Include cells with the fraction of fragments in - genomic blacklist regions not bigger than this value. - If multiple values provided, each of them will be - applied to the correspondent dataset from the '--mex' - input based on the '--identity' file. - Default: 0.05 (applied to all datasets) - 'sd:layout': + default: "4" + label: "Maximum nucleosome signal per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the nucleosome signal + higher than the provided value. + Nucleosome signal is a measurement + of nucleosome occupancy. It quantifies + the approximate ratio of mononucleosomal + to nucleosome-free ATAC fragments. + If the selected "Cell Ranger ARC + Sample" includes multiple aggregated + datasets, each of them can be filtered + independently by providing comma or + space-separated list of filtering + thresholds. The order and number of + the specified values need to match + with the datasets order from the + "aggregation_metadata.csv" output + generated by "Cell Ranger ARC Sample" + and accessible on the "Files" tab. + Default: 4 + "sd:layout": advanced: true - call_by: + maximum_blacklist_fraction: type: string? - default: null - label: "Replace Cell Ranger ARC peaks with MACS2 peaks called for cells grouped by selected column" - doc: | - Replace Cell Ranger ARC peaks with MACS2 peaks called - for cells grouped by the column from the optionally - provided --barcodes file. If --barcodes file was not - provided MACS2 peaks can be still called per dataset - by setting --callby to new.ident. Peaks are called - only after applying all RNA related thresholds, - maximum nucleosome signal, and minimum TSS enrichment - scores filters. 
- Default: do not call peaks - 'sd:layout': - advanced: true - - remove_doublets: - type: - - type: enum - symbols: - - "union" - - "onlyrna" - - "onlyatac" - - "intersect" - - "none" - default: "none" - label: "Remove cells that were identified as doublets" - doc: | - Remove cells that were identified as doublets. For - RNA assay cells with UMI < 200 will not be evaluated. - Default: do not remove doublets - 'sd:layout': - advanced: true - - rna_doublet_rate: - type: float? - default: null - label: "Expected RNA doublet rate" - doc: | - Expected RNA doublet rate. Default: 1 percent per - thousand cells captured with 10x genomics - 'sd:layout': - advanced: true - - rna_doublet_rate_sd: - type: float? - default: null - label: "Uncertainty range in the RNA doublet rate" - doc: | - Uncertainty range in the RNA doublet rate, interpreted as - a +/- around the value provided in --rnadbr. Set to 0 to - disable. Set to 1 to make the threshold depend entirely - on the misclassification rate. Default: 40 percents of the - value provided in --rnadbr - 'sd:layout': - advanced: true - - atac_doublet_rate: - type: float? - default: null - label: "Expected ATAC doublet rate" - doc: | - Expected ATAC doublet rate. Default: 1 percent per thousand - cells captured with 10x genomics - 'sd:layout': - advanced: true - - atac_doublet_rate_sd: - type: float? - default: null - label: "Uncertainty range in the ATAC doublet rate" - doc: | - Uncertainty range in the ATAC doublet rate, interpreted as - a +/- around the value provided in --atacdbr. Set to 0 to - disable. Set to 1 to make the threshold depend entirely - on the misclassification rate. Default: 40 percents of the - value provided in --atacdbr - 'sd:layout': + default: "0.05" + label: "Maximum blacklist fraction per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the fraction of ATAC + fragments in genomic blacklist regions + bigger than the provided value. 
+ If the selected "Cell Ranger ARC + Sample" includes multiple aggregated + datasets, each of them can be filtered + independently by providing comma or + space-separated list of filtering + thresholds. The order and number of + the specified values need to match + with the datasets order from the + "aggregation_metadata.csv" output + generated by "Cell Ranger ARC Sample" + and accessible on the "Files" tab. + Default: 0.05 + "sd:layout": advanced: true color_theme: @@ -344,41 +394,12 @@ inputs: - "classic" - "void" default: "classic" - label: "Color theme for all generated plots" + label: "Plots color theme" doc: | - Color theme for all generated plots. One of gray, bw, linedraw, light, - dark, minimal, classic, void. + Color theme for all plots saved + as PNG files. Default: classic - 'sd:layout': - advanced: true - - parallel_memory_limit: - type: - - "null" - - type: enum - symbols: - - "32" - default: "32" - label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" - doc: | - Maximum memory in GB allowed to be shared between the workers - when using multiple --cpus. - Forced to 32 GB - 'sd:layout': - advanced: true - - vector_memory_limit: - type: - - "null" - - type: enum - symbols: - - "96" - default: "96" - label: "Maximum vector memory in GB allowed to be used by R" - doc: | - Maximum vector memory in GB allowed to be used by R. - Forced to 96 GB - 'sd:layout': + "sd:layout": advanced: true threads: @@ -391,1002 +412,694 @@ inputs: - "3" - "4" default: "1" - label: "Number of cores/cpus to use" + label: "Cores/CPUs number" doc: | - Number of cores/cpus to use. - Forced to 1 - 'sd:layout': + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 1 + "sd:layout": advanced: true outputs: - raw_1_2_qc_mtrcs_pca_plot_png: type: File? 
outputSource: sc_multiome_filter/raw_1_2_qc_mtrcs_pca_plot_png - label: "PC1 and PC2 from the QC metrics PCA (not filtered)" + label: "QC metrics PCA (1,2), raw" doc: | - PC1 and PC2 from the QC metrics PCA (not filtered). - PNG format - 'sd:visualPlugins': + PC1 and PC2 from the QC metrics + PCA for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'PC1 and PC2 from the QC metrics PCA' + tab: "Raw" + Caption: "QC metrics PCA (1,2)" raw_2_3_qc_mtrcs_pca_plot_png: type: File? outputSource: sc_multiome_filter/raw_2_3_qc_mtrcs_pca_plot_png - label: "PC2 and PC3 from the QC metrics PCA (not filtered)" + label: "QC metrics PCA (2,3), raw" doc: | - PC2 and PC3 from the QC metrics PCA (not filtered). - PNG format - 'sd:visualPlugins': + PC2 and PC3 from the QC metrics + PCA for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'PC2 and PC3 from the QC metrics PCA' + tab: "Raw" + Caption: "QC metrics PCA (2,3)" raw_cells_count_plot_png: type: File? outputSource: sc_multiome_filter/raw_cells_count_plot_png - label: "Number of cells per dataset (not filtered)" + label: "Cells per dataset, raw" doc: | - Number of cells per dataset (not filtered). - PNG format - 'sd:visualPlugins': + Number of cells per dataset + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Number of cells per dataset' + tab: "Raw" + Caption: "Cells per dataset" raw_rna_umi_dnst_plot_png: type: File? outputSource: sc_multiome_filter/raw_rna_umi_dnst_plot_png - label: "UMI per cell density for RNA assay (not filtered)" + label: "RNA UMI per cell, raw" doc: | - UMI per cell density for RNA assay (not filtered). - PNG format - 'sd:visualPlugins': + RNA UMI per cell density + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'UMI per cell density for RNA assay' + tab: "Raw" + Caption: "RNA UMI per cell" raw_gene_dnst_plot_png: type: File? 
outputSource: sc_multiome_filter/raw_gene_dnst_plot_png - label: "Genes per cell density (not filtered)" + label: "Genes per cell, raw" doc: | - Genes per cell density (not filtered). - PNG format - 'sd:visualPlugins': + Genes per cell density + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Genes per cell density' + tab: "Raw" + Caption: "Genes per cell" raw_gene_umi_corr_plot_png: type: File? outputSource: sc_multiome_filter/raw_gene_umi_corr_plot_png - label: "Genes vs UMI per cell correlation for RNA assay (not filtered)" + label: "Genes vs RNA UMI, raw" doc: | - Genes vs UMI per cell correlation for RNA assay (not filtered). - PNG format - 'sd:visualPlugins': + Genes vs RNA UMI per cell + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Genes vs UMI per cell correlation for RNA assay' + tab: "Raw" + Caption: "Genes vs RNA UMI" raw_mito_dnst_plot_png: type: File? outputSource: sc_multiome_filter/raw_mito_dnst_plot_png - label: "Percentage of transcripts mapped to mitochondrial genes per cell density (not filtered)" + label: "Mitochondrial percentage, raw" doc: | - Percentage of transcripts mapped to mitochondrial genes per cell density (not filtered). - PNG format - 'sd:visualPlugins': + Percentage of transcripts mapped to + mitochondrial genes per cell density + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Percentage of transcripts mapped to mitochondrial genes per cell density' + tab: "Raw" + Caption: "Mitochondrial percentage" raw_nvlt_dnst_plot_png: type: File? outputSource: sc_multiome_filter/raw_nvlt_dnst_plot_png - label: "Novelty score per cell density for RNA assay (not filtered)" + label: "Novelty score, raw" doc: | - Novelty score per cell density for RNA assay (not filtered). 
- PNG format - 'sd:visualPlugins': + Novelty score per cell density + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Novelty score per cell density for RNA assay' + tab: "Raw" + Caption: "Novelty score" raw_atac_umi_dnst_plot_png: type: File? outputSource: sc_multiome_filter/raw_atac_umi_dnst_plot_png - label: "UMI per cell density for ATAC assay (not filtered)" + label: "ATAC UMI per cell, raw" doc: | - UMI per cell density for ATAC assay (not filtered). - PNG format - 'sd:visualPlugins': + ATAC UMI per cell density + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'UMI per cell density for ATAC assay' + tab: "Raw" + Caption: "ATAC UMI per cell" raw_peak_dnst_plot_png: type: File? outputSource: sc_multiome_filter/raw_peak_dnst_plot_png - label: "Peaks per cell density (not filtered)" + label: "Peaks per cell, raw" doc: | - Peaks per cell density (not filtered). - PNG format - 'sd:visualPlugins': + Peaks per cell density + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Peaks per cell density' + tab: "Raw" + Caption: "Peaks per cell" raw_blck_dnst_plot_png: type: File? outputSource: sc_multiome_filter/raw_blck_dnst_plot_png - label: "Fraction of ATAC fragments within genomic blacklist regions per cell density (not filtered)" + label: "Blacklist regions fraction, raw" doc: | - Fraction of ATAC fragments within genomic blacklist regions per cell density (not filtered). - PNG format - 'sd:visualPlugins': + Fraction of ATAC fragments within + genomic blacklist regions per cell + density for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Fraction of ATAC fragments within genomic blacklist regions per cell density' + tab: "Raw" + Caption: "Blacklist regions fraction" raw_rna_atac_umi_corr_plot_png: type: File? 
outputSource: sc_multiome_filter/raw_rna_atac_umi_corr_plot_png - label: "UMI per cell correlation for RNA vs ATAC assays (not filtered)" + label: "RNA UMI vs ATAC UMI, raw" doc: | - UMI per cell correlation for RNA vs ATAC assays (not filtered). - PNG format - 'sd:visualPlugins': + RNA UMI per cell vs ATAC UMI + per cell for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'UMI per cell correlation for RNA vs ATAC assays' + tab: "Raw" + Caption: "RNA UMI vs ATAC UMI" raw_tss_atac_umi_corr_plot_png: type: File? outputSource: sc_multiome_filter/raw_tss_atac_umi_corr_plot_png - label: "TSS enrichment score vs UMI per cell correlation for ATAC assay (not filtered)" + label: "TSS enrichment vs ATAC UMI, raw" doc: | - TSS enrichment score vs UMI per cell correlation for ATAC assay (not filtered). - PNG format - 'sd:visualPlugins': + TSS enrichment score vs ATAC UMI + per cell for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'TSS enrichment score vs UMI per cell correlation for ATAC assay' + tab: "Raw" + Caption: "TSS enrichment vs ATAC UMI" raw_qc_mtrcs_dnst_plot_png: type: File? outputSource: sc_multiome_filter/raw_qc_mtrcs_dnst_plot_png - label: "QC metrics per cell density (not filtered)" + label: "Main QC metrics, raw" doc: | - QC metrics per cell density (not filtered). - PNG format - 'sd:visualPlugins': + Main QC metrics per cell densities + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'QC metrics per cell density' + tab: "Raw" + Caption: "Main QC metrics" raw_rnadbl_plot_png: type: File? outputSource: sc_multiome_filter/raw_rnadbl_plot_png - label: "Percentage of RNA doublets per dataset (not filtered)" + label: "RNA doublets, raw" doc: | - Percentage of RNA doublets per dataset (not filtered). 
- PNG format - 'sd:visualPlugins': + Percentage of RNA doublets per + dataset for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Percentage of RNA doublets per dataset' + tab: "Raw" + Caption: "RNA doublets" raw_atacdbl_plot_png: type: File? outputSource: sc_multiome_filter/raw_atacdbl_plot_png - label: "Percentage of ATAC doublets per dataset (not filtered)" + label: "ATAC doublets, raw" doc: | - Percentage of ATAC doublets per dataset (not filtered). - PNG format - 'sd:visualPlugins': + Percentage of ATAC doublets per + dataset for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Percentage of ATAC doublets per dataset' + tab: "Raw" + Caption: "ATAC doublets" raw_vrlpdbl_plot_png: type: File? outputSource: sc_multiome_filter/raw_vrlpdbl_plot_png - label: "Doublets overlap for RNA and ATAC assays per dataset (not filtered)" + label: "RNA and ATAC doublets overlap, raw" doc: | - Doublets overlap for RNA and ATAC assays per dataset (not filtered). - PNG format - 'sd:visualPlugins': + RNA and ATAC doublets overlap per + dataset for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Doublets overlap for RNA and ATAC assays per dataset' + tab: "Raw" + Caption: "RNA and ATAC doublets overlap" raw_tss_nrch_plot_png: type: File? outputSource: sc_multiome_filter/raw_tss_nrch_plot_png - label: "TSS enrichment score (not filtered)" + label: "TSS enrichment, raw" doc: | - TSS enrichment score (not filtered). - PNG format - 'sd:visualPlugins': + TSS enrichment score + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'TSS enrichment score' + tab: "Raw" + Caption: "TSS enrichment" raw_frgm_hist_png: type: File? outputSource: sc_multiome_filter/raw_frgm_hist_png - label: "Fragments length histogram (not filtered)" + label: "Fragments length, raw" doc: | - Fragments length histogram (not filtered). 
- PNG format - 'sd:visualPlugins': + Fragments length distribution + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Fragments length histogram' + tab: "Raw" + Caption: "Fragments length" raw_rna_umi_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/raw_rna_umi_dnst_spl_cnd_plot_png - label: "Split by grouping condition UMI per cell density for RNA assay (not filtered)" + label: "RNA UMI per cell, raw, split by condition" doc: | - Split by grouping condition UMI per cell density for RNA assay (not filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition RNA UMI + per cell for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Split by grouping condition UMI per cell density for RNA assay' + tab: "Raw, by condition" + Caption: "RNA UMI per cell" raw_gene_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/raw_gene_dnst_spl_cnd_plot_png - label: "Split by grouping condition genes per cell density (not filtered)" + label: "Genes per cell, raw, split by condition" doc: | - Split by grouping condition genes per cell density (not filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition genes + per cell for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Split by grouping condition genes per cell density' + tab: "Raw, by condition" + Caption: "Genes per cell" raw_mito_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/raw_mito_dnst_spl_cnd_plot_png - label: "Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density (not filtered)" + label: "Mitochondrial percentage, raw, split by condition" doc: | - Split by grouping condition the percentage of transcripts mapped - to mitochondrial genes per cell density (not filtered). 
- PNG format - 'sd:visualPlugins': + Split by grouping condition the + percentage of transcripts mapped to + mitochondrial genes per cell density + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density' + tab: "Raw, by condition" + Caption: "Mitochondrial percentage" raw_nvlt_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/raw_nvlt_dnst_spl_cnd_plot_png - label: "Split by grouping condition the novelty score per cell density for RNA assay (not filtered)" + label: "Novelty score, raw, split by condition" doc: | - Split by grouping condition the novelty score per cell density for RNA assay (not filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition the + novelty score per cell density + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Split by grouping condition the novelty score per cell density for RNA assay' + tab: "Raw, by condition" + Caption: "Novelty score" raw_atac_umi_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/raw_atac_umi_dnst_spl_cnd_plot_png - label: "Split by grouping condition UMI per cell density for ATAC assay (not filtered)" + label: "ATAC UMI per cell, raw, split by condition" doc: | - Split by grouping condition UMI per cell density for ATAC assay (not filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition ATAC + UMI per cell density for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Split by grouping condition UMI per cell density for ATAC assay' + tab: "Raw, by condition" + Caption: "ATAC UMI per cell" raw_peak_dnst_spl_cnd_plot_png: type: File? 
outputSource: sc_multiome_filter/raw_peak_dnst_spl_cnd_plot_png - label: "Split by grouping condition peaks per cell density (not filtered)" + label: "Peaks per cell, raw, split by condition" doc: | - Split by grouping condition peaks per cell density (not filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition peaks + per cell for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Split by grouping condition peaks per cell density' + tab: "Raw, by condition" + Caption: "Peaks per cell" raw_blck_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/raw_blck_dnst_spl_cnd_plot_png - label: "Split by grouping condition the fraction of ATAC fragments within genomic blacklist regions per cell density (not filtered)" - doc: | - Split by grouping condition the fraction of ATAC fragments within genomic - blacklist regions per cell density (not filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Not filtered QC' - Caption: 'Split by grouping condition the fraction of ATAC fragments within genomic blacklist regions per cell density' - - mid_fltr_1_2_qc_mtrcs_pca_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_1_2_qc_mtrcs_pca_plot_png - label: "PC1 and PC2 from the QC metrics PCA (intermediate filtered)" - doc: | - PC1 and PC2 from the QC metrics PCA (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'PC1 and PC2 from the QC metrics PCA' - - mid_fltr_2_3_qc_mtrcs_pca_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_2_3_qc_mtrcs_pca_plot_png - label: "PC2 and PC3 from the QC metrics PCA (intermediate filtered)" - doc: | - PC2 and PC3 from the QC metrics PCA (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'PC2 and PC3 from the QC metrics PCA' - - mid_fltr_cells_count_plot_png: - type: File? 
- outputSource: sc_multiome_filter/mid_fltr_cells_count_plot_png - label: "Number of cells per dataset (intermediate filtered)" - doc: | - Number of cells per dataset (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'Number of cells per dataset' - - mid_fltr_rna_umi_dnst_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_rna_umi_dnst_plot_png - label: "UMI per cell density for RNA assay (intermediate filtered)" - doc: | - UMI per cell density for RNA assay (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'UMI per cell density for RNA assay' - - mid_fltr_gene_dnst_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_gene_dnst_plot_png - label: "Genes per cell density (intermediate filtered)" - doc: | - Genes per cell density (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'Genes per cell density' - - mid_fltr_gene_umi_corr_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_gene_umi_corr_plot_png - label: "Genes vs UMI per cell correlation for RNA assay (intermediate filtered)" - doc: | - Genes vs UMI per cell correlation for RNA assay (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'Genes vs UMI per cell correlation for RNA assay' - - mid_fltr_mito_dnst_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_mito_dnst_plot_png - label: "Percentage of transcripts mapped to mitochondrial genes per cell density (intermediate filtered)" - doc: | - Percentage of transcripts mapped to mitochondrial genes per cell density (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'Percentage of transcripts mapped to mitochondrial genes per cell density' - - mid_fltr_nvlt_dnst_plot_png: - type: File? 
- outputSource: sc_multiome_filter/mid_fltr_nvlt_dnst_plot_png - label: "Novelty score per cell density for RNA assay (intermediate filtered)" - doc: | - Novelty score per cell density for RNA assay (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'Novelty score per cell density for RNA assay' - - mid_fltr_atac_umi_dnst_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_atac_umi_dnst_plot_png - label: "UMI per cell density for ATAC assay (intermediate filtered)" - doc: | - UMI per cell density for ATAC assay (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'UMI per cell density for ATAC assay' - - mid_fltr_peak_dnst_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_peak_dnst_plot_png - label: "Peaks per cell density (intermediate filtered)" - doc: | - Peaks per cell density (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'Peaks per cell density' - - mid_fltr_blck_dnst_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_blck_dnst_plot_png - label: "Fraction of ATAC fragments within genomic blacklist regions per cell density (intermediate filtered)" + label: "Blacklist regions fraction, raw, split by condition" doc: | - Fraction of ATAC fragments within genomic blacklist regions per cell density (intermediate filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition the + fraction of ATAC fragments within + genomic blacklist regions per cell + density for raw data + "sd:visualPlugins": - image: - tab: 'Mid. filtered QC' - Caption: 'Fraction of ATAC fragments within genomic blacklist regions per cell density' - - mid_fltr_rna_atac_umi_corr_plot_png: - type: File? 
- outputSource: sc_multiome_filter/mid_fltr_rna_atac_umi_corr_plot_png - label: "UMI per cell correlation for RNA vs ATAC assays (intermediate filtered)" - doc: | - UMI per cell correlation for RNA vs ATAC assays (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'UMI per cell correlation for RNA vs ATAC assays' - - mid_fltr_tss_atac_umi_corr_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_tss_atac_umi_corr_plot_png - label: "TSS enrichment score vs UMI per cell correlation for ATAC assay (intermediate filtered)" - doc: | - TSS enrichment score vs UMI per cell correlation for ATAC assay (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'TSS enrichment score vs UMI per cell correlation for ATAC assay' - - mid_fltr_qc_mtrcs_dnst_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_qc_mtrcs_dnst_plot_png - label: "QC metrics per cell density (intermediate filtered)" - doc: | - QC metrics per cell density (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'QC metrics per cell density' - - mid_fltr_rnadbl_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_rnadbl_plot_png - label: "Percentage of RNA doublets per dataset (intermediate filtered)" - doc: | - Percentage of RNA doublets per dataset (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'Percentage of RNA doublets per dataset' - - mid_fltr_atacdbl_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_atacdbl_plot_png - label: "Percentage of ATAC doublets per dataset (intermediate filtered)" - doc: | - Percentage of ATAC doublets per dataset (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. 
filtered QC' - Caption: 'Percentage of ATAC doublets per dataset' - - mid_fltr_vrlpdbl_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_vrlpdbl_plot_png - label: "Doublets overlap for RNA and ATAC assays per dataset (intermediate filtered)" - doc: | - Doublets overlap for RNA and ATAC assays per dataset (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'Doublets overlap for RNA and ATAC assays per dataset' - - mid_fltr_tss_nrch_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_tss_nrch_plot_png - label: "TSS enrichment score (intermediate filtered)" - doc: | - TSS enrichment score (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'TSS enrichment score' - - mid_fltr_frgm_hist_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_frgm_hist_png - label: "Fragments length histogram (intermediate filtered)" - doc: | - Fragments length histogram (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'Fragments length histogram' - - mid_fltr_rna_umi_dnst_spl_cnd_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_rna_umi_dnst_spl_cnd_plot_png - label: "Split by grouping condition UMI per cell density for RNA assay (intermediate filtered)" - doc: | - Split by grouping condition UMI per cell density for RNA assay (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'Split by grouping condition UMI per cell density for RNA assay' - - mid_fltr_gene_dnst_spl_cnd_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_gene_dnst_spl_cnd_plot_png - label: "Split by grouping condition genes per cell density (intermediate filtered)" - doc: | - Split by grouping condition genes per cell density (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. 
filtered QC' - Caption: 'Split by grouping condition genes per cell density' - - mid_fltr_mito_dnst_spl_cnd_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_mito_dnst_spl_cnd_plot_png - label: "Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density (intermediate filtered)" - doc: | - Split by grouping condition the percentage of transcripts mapped - to mitochondrial genes per cell density (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density' - - mid_fltr_nvlt_dnst_spl_cnd_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_nvlt_dnst_spl_cnd_plot_png - label: "Split by grouping condition the novelty score per cell density for RNA assay (intermediate filtered)" - doc: | - Split by grouping condition the novelty score per cell density for RNA assay (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'Split by grouping condition the novelty score per cell density for RNA assay' - - mid_fltr_atac_umi_dnst_spl_cnd_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_atac_umi_dnst_spl_cnd_plot_png - label: "Split by grouping condition UMI per cell density for ATAC assay (intermediate filtered)" - doc: | - Split by grouping condition UMI per cell density for ATAC assay (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'Split by grouping condition UMI per cell density for ATAC assay' - - mid_fltr_peak_dnst_spl_cnd_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_peak_dnst_spl_cnd_plot_png - label: "Split by grouping condition peaks per cell density (intermediate filtered)" - doc: | - Split by grouping condition peaks per cell density (intermediate filtered). 
- PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'Split by grouping condition peaks per cell density' - - mid_fltr_blck_dnst_spl_cnd_plot_png: - type: File? - outputSource: sc_multiome_filter/mid_fltr_blck_dnst_spl_cnd_plot_png - label: "Split by grouping condition the fraction of ATAC fragments within genomic blacklist regions per cell density (intermediate filtered)" - doc: | - Split by grouping condition the fraction of ATAC fragments within genomic - blacklist regions per cell density (intermediate filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Mid. filtered QC' - Caption: 'Split by grouping condition the fraction of ATAC fragments within genomic blacklist regions per cell density' + tab: "Raw, by condition" + Caption: "Blacklist regions fraction" fltr_1_2_qc_mtrcs_pca_plot_png: type: File? outputSource: sc_multiome_filter/fltr_1_2_qc_mtrcs_pca_plot_png - label: "PC1 and PC2 from the QC metrics PCA (filtered)" + label: "QC metrics PCA (1,2), filtered" doc: | - PC1 and PC2 from the QC metrics PCA (filtered). - PNG format - 'sd:visualPlugins': + PC1 and PC2 from the QC metrics + PCA for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'PC1 and PC2 from the QC metrics PCA' + tab: "Filtered" + Caption: "QC metrics PCA (1,2)" fltr_2_3_qc_mtrcs_pca_plot_png: type: File? outputSource: sc_multiome_filter/fltr_2_3_qc_mtrcs_pca_plot_png - label: "PC2 and PC3 from the QC metrics PCA (filtered)" + label: "QC metrics PCA (2,3), filtered" doc: | - PC2 and PC3 from the QC metrics PCA (filtered). - PNG format - 'sd:visualPlugins': + PC2 and PC3 from the QC metrics + PCA for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'PC2 and PC3 from the QC metrics PCA' + tab: "Filtered" + Caption: "QC metrics PCA (2,3)" fltr_cells_count_plot_png: type: File? 
outputSource: sc_multiome_filter/fltr_cells_count_plot_png - label: "Number of cells per dataset (filtered)" + label: "Cells per dataset, filtered" doc: | - Number of cells per dataset (filtered). - PNG format - 'sd:visualPlugins': + Number of cells per dataset + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Number of cells per dataset' + tab: "Filtered" + Caption: "Cells per dataset" fltr_rna_umi_dnst_plot_png: type: File? outputSource: sc_multiome_filter/fltr_rna_umi_dnst_plot_png - label: "UMI per cell density for RNA assay (filtered)" + label: "RNA UMI per cell, filtered" doc: | - UMI per cell density for RNA assay (filtered). - PNG format - 'sd:visualPlugins': + RNA UMI per cell density + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'UMI per cell density for RNA assay' + tab: "Filtered" + Caption: "RNA UMI per cell" fltr_gene_dnst_plot_png: type: File? outputSource: sc_multiome_filter/fltr_gene_dnst_plot_png - label: "Genes per cell density (filtered)" + label: "Genes per cell, filtered" doc: | - Genes per cell density (filtered). - PNG format - 'sd:visualPlugins': + Genes per cell density + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Genes per cell density' + tab: "Filtered" + Caption: "Genes per cell" fltr_gene_umi_corr_plot_png: type: File? outputSource: sc_multiome_filter/fltr_gene_umi_corr_plot_png - label: "Genes vs UMI per cell correlation for RNA assay (filtered)" + label: "Genes vs RNA UMI, filtered" doc: | - Genes vs UMI per cell correlation for RNA assay (filtered). - PNG format - 'sd:visualPlugins': + Genes vs RNA UMI per cell + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Genes vs UMI per cell correlation for RNA assay' + tab: "Filtered" + Caption: "Genes vs RNA UMI" fltr_mito_dnst_plot_png: type: File? 
outputSource: sc_multiome_filter/fltr_mito_dnst_plot_png - label: "Percentage of transcripts mapped to mitochondrial genes per cell density (filtered)" + label: "Mitochondrial percentage, filtered" doc: | - Percentage of transcripts mapped to mitochondrial genes per cell density (filtered). - PNG format - 'sd:visualPlugins': + Percentage of transcripts mapped to + mitochondrial genes per cell density + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Percentage of transcripts mapped to mitochondrial genes per cell density' + tab: "Filtered" + Caption: "Mitochondrial percentage" fltr_nvlt_dnst_plot_png: type: File? outputSource: sc_multiome_filter/fltr_nvlt_dnst_plot_png - label: "Novelty score per cell density for RNA assay (filtered)" + label: "Novelty score, filtered" doc: | - Novelty score per cell density for RNA assay (filtered). - PNG format - 'sd:visualPlugins': + Novelty score per cell density + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Novelty score per cell density for RNA assay' + tab: "Filtered" + Caption: "Novelty score" fltr_atac_umi_dnst_plot_png: type: File? outputSource: sc_multiome_filter/fltr_atac_umi_dnst_plot_png - label: "UMI per cell density for ATAC assay (filtered)" + label: "ATAC UMI per cell, filtered" doc: | - UMI per cell density for ATAC assay (filtered). - PNG format - 'sd:visualPlugins': + ATAC UMI per cell density + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'UMI per cell density for ATAC assay' + tab: "Filtered" + Caption: "ATAC UMI per cell" fltr_peak_dnst_plot_png: type: File? outputSource: sc_multiome_filter/fltr_peak_dnst_plot_png - label: "Peaks per cell density (filtered)" + label: "Peaks per cell, filtered" doc: | - Peaks per cell density (filtered). 
- PNG format - 'sd:visualPlugins': + Peaks per cell density + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Peaks per cell density' + tab: "Filtered" + Caption: "Peaks per cell" fltr_blck_dnst_plot_png: type: File? outputSource: sc_multiome_filter/fltr_blck_dnst_plot_png - label: "Fraction of ATAC fragments within genomic blacklist regions per cell density (filtered)" + label: "Blacklist regions fraction, filtered" doc: | - Fraction of ATAC fragments within genomic blacklist regions per cell density (filtered). - PNG format - 'sd:visualPlugins': + Fraction of ATAC fragments within + genomic blacklist regions per cell + density for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Fraction of ATAC fragments within genomic blacklist regions per cell density' + tab: "Filtered" + Caption: "Blacklist regions fraction" fltr_rna_atac_umi_corr_plot_png: type: File? outputSource: sc_multiome_filter/fltr_rna_atac_umi_corr_plot_png - label: "UMI per cell correlation for RNA vs ATAC assays (filtered)" + label: "RNA UMI vs ATAC UMI, filtered" doc: | - UMI per cell correlation for RNA vs ATAC assays (filtered). - PNG format - 'sd:visualPlugins': + RNA UMI per cell vs ATAC UMI + per cell for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'UMI per cell correlation for RNA vs ATAC assays' + tab: "Filtered" + Caption: "RNA UMI vs ATAC UMI" - fltr_rnadbl_plot_png: + fltr_tss_atac_umi_corr_plot_png: type: File? - outputSource: sc_multiome_filter/fltr_rnadbl_plot_png - label: "Percentage of RNA doublets per dataset (filtered)" + outputSource: sc_multiome_filter/fltr_tss_atac_umi_corr_plot_png + label: "TSS enrichment vs ATAC UMI, filtered" doc: | - Percentage of RNA doublets per dataset (filtered). 
- PNG format - 'sd:visualPlugins': + TSS enrichment score vs ATAC UMI + per cell for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Percentage of RNA doublets per dataset' + tab: "Filtered" + Caption: "TSS enrichment vs ATAC UMI" - fltr_atacdbl_plot_png: + fltr_qc_mtrcs_dnst_plot_png: type: File? - outputSource: sc_multiome_filter/fltr_atacdbl_plot_png - label: "Percentage of ATAC doublets per dataset (filtered)" + outputSource: sc_multiome_filter/fltr_qc_mtrcs_dnst_plot_png + label: "Main QC metrics, filtered" doc: | - Percentage of ATAC doublets per dataset (filtered). - PNG format - 'sd:visualPlugins': + Main QC metrics per cell densities + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Percentage of ATAC doublets per dataset' + tab: "Filtered" + Caption: "Main QC metrics" - fltr_vrlpdbl_plot_png: + fltr_rnadbl_plot_png: type: File? - outputSource: sc_multiome_filter/fltr_vrlpdbl_plot_png - label: "Doublets overlap for RNA and ATAC assays per dataset (filtered)" + outputSource: sc_multiome_filter/fltr_rnadbl_plot_png + label: "RNA doublets, filtered" doc: | - Doublets overlap for RNA and ATAC assays per dataset (filtered). - PNG format - 'sd:visualPlugins': + Percentage of RNA doublets per + dataset for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Doublets overlap for RNA and ATAC assays per dataset' + tab: "Filtered" + Caption: "RNA doublets" - fltr_tss_atac_umi_corr_plot_png: + fltr_atacdbl_plot_png: type: File? - outputSource: sc_multiome_filter/fltr_tss_atac_umi_corr_plot_png - label: "TSS enrichment score vs UMI per cell correlation for ATAC assay (filtered)" + outputSource: sc_multiome_filter/fltr_atacdbl_plot_png + label: "ATAC doublets, filtered" doc: | - TSS enrichment score vs UMI per cell correlation for ATAC assay (filtered). 
- PNG format - 'sd:visualPlugins': + Percentage of ATAC doublets per + dataset for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'TSS enrichment score vs UMI per cell correlation for ATAC assay' + tab: "Filtered" + Caption: "ATAC doublets" - fltr_qc_mtrcs_dnst_plot_png: + fltr_vrlpdbl_plot_png: type: File? - outputSource: sc_multiome_filter/fltr_qc_mtrcs_dnst_plot_png - label: "QC metrics per cell density (filtered)" + outputSource: sc_multiome_filter/fltr_vrlpdbl_plot_png + label: "RNA and ATAC doublets overlap, filtered" doc: | - QC metrics per cell density (filtered). - PNG format - 'sd:visualPlugins': + RNA and ATAC doublets overlap per + dataset for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'QC metrics per cell density' + tab: "Filtered" + Caption: "RNA and ATAC doublets overlap" fltr_tss_nrch_plot_png: type: File? outputSource: sc_multiome_filter/fltr_tss_nrch_plot_png - label: "TSS enrichment score (filtered)" + label: "TSS enrichment, filtered" doc: | - TSS enrichment score (filtered). - PNG format - 'sd:visualPlugins': + TSS enrichment score + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'TSS enrichment score' + tab: "Filtered" + Caption: "TSS enrichment" fltr_frgm_hist_png: type: File? outputSource: sc_multiome_filter/fltr_frgm_hist_png - label: "Fragments length histogram (filtered)" + label: "Fragments length, filtered" doc: | - Fragments length histogram (filtered). - PNG format - 'sd:visualPlugins': + Fragments length distribution + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Fragments length histogram' + tab: "Filtered" + Caption: "Fragments length" fltr_rna_umi_dnst_spl_cnd_plot_png: type: File? 
outputSource: sc_multiome_filter/fltr_rna_umi_dnst_spl_cnd_plot_png - label: "Split by grouping condition UMI per cell density for RNA assay (filtered)" + label: "RNA UMI per cell, filtered, split by condition" doc: | - Split by grouping condition UMI per cell density for RNA assay (filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition RNA UMI + per cell for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Split by grouping condition UMI per cell density for RNA assay' + tab: "Filtered, by condition" + Caption: "RNA UMI per cell" fltr_gene_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/fltr_gene_dnst_spl_cnd_plot_png - label: "Split by grouping condition genes per cell density (filtered)" + label: "Genes per cell, filtered, split by condition" doc: | - Split by grouping condition genes per cell density (filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition genes + per cell for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Split by grouping condition genes per cell density' + tab: "Filtered, by condition" + Caption: "Genes per cell" fltr_mito_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/fltr_mito_dnst_spl_cnd_plot_png - label: "Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density (filtered)" + label: "Mitochondrial percentage, filtered, split by condition" doc: | - Split by grouping condition the percentage of transcripts mapped - to mitochondrial genes per cell density (filtered). 
- PNG format - 'sd:visualPlugins': + Split by grouping condition the + percentage of transcripts mapped to + mitochondrial genes per cell density + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density' + tab: "Filtered, by condition" + Caption: "Mitochondrial percentage" fltr_nvlt_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/fltr_nvlt_dnst_spl_cnd_plot_png - label: "Split by grouping condition the novelty score per cell density for RNA assay (filtered)" + label: "Novelty score, filtered, split by condition" doc: | - Split by grouping condition the novelty score per cell density for RNA assay (filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition the + novelty score per cell density + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Split by grouping condition the novelty score per cell density for RNA assay' + tab: "Filtered, by condition" + Caption: "Novelty score" fltr_atac_umi_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/fltr_atac_umi_dnst_spl_cnd_plot_png - label: "Split by grouping condition UMI per cell density for ATAC assay (filtered)" + label: "ATAC UMI per cell, filtered, split by condition" doc: | - Split by grouping condition UMI per cell density for ATAC assay (filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition ATAC + UMI per cell density for filtered + data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Split by grouping condition UMI per cell density for ATAC assay' + tab: "Filtered, by condition" + Caption: "ATAC UMI per cell" fltr_peak_dnst_spl_cnd_plot_png: type: File? 
outputSource: sc_multiome_filter/fltr_peak_dnst_spl_cnd_plot_png - label: "Split by grouping condition peaks per cell density (filtered)" + label: "Peaks per cell, filtered, split by condition" doc: | - Split by grouping condition peaks per cell density (filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition peaks + per cell for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Split by grouping condition peaks per cell density' + tab: "Filtered, by condition" + Caption: "Peaks per cell" fltr_blck_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/fltr_blck_dnst_spl_cnd_plot_png - label: "Split by grouping condition the fraction of ATAC fragments within genomic blacklist regions per cell density (filtered)" + label: "Blacklist regions fraction, filtered, split by condition" doc: | - Split by grouping condition the fraction of ATAC fragments within genomic - blacklist regions per cell density (filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition the + fraction of ATAC fragments within + genomic blacklist regions per cell + density for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Split by grouping condition the fraction of ATAC fragments within genomic blacklist regions per cell density' - - ucsc_cb_config_data: - type: File - outputSource: compress_cellbrowser_config_data/compressed_folder - label: "Compressed directory with UCSC Cellbrowser configuration data" - doc: | - Compressed directory with UCSC Cellbrowser configuration data. + tab: "Filtered, by condition" + Caption: "Blacklist regions fraction" ucsc_cb_html_data: type: Directory outputSource: sc_multiome_filter/ucsc_cb_html_data - label: "Directory with UCSC Cellbrowser html data" + label: "UCSC Cell Browser data" doc: | - Directory with UCSC Cellbrowser html data. 
+ Directory with UCSC Cell Browser + data ucsc_cb_html_file: type: File outputSource: sc_multiome_filter/ucsc_cb_html_file - label: "Open in UCSC Cell Browser" + label: "UCSC Cell Browser" doc: | - HTML index file from the directory with UCSC Cellbrowser html data. - 'sd:visualPlugins': + UCSC Cell Browser HTML index file + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" seurat_data_rds: type: File outputSource: sc_multiome_filter/seurat_data_rds - label: "Processed Seurat data in RDS format" + label: "Processed seurat data in RDS format" doc: | - Processed Seurat data in RDS format + Processed seurat data in RDS format sc_multiome_filter_stdout_log: type: File outputSource: sc_multiome_filter/stdout_log - label: "stdout log generated by sc_multiome_filter step" + label: "Output log, filtering step" doc: | stdout log generated by sc_multiome_filter step sc_multiome_filter_stderr_log: type: File outputSource: sc_multiome_filter/stderr_log - label: "stderr log generated by sc_multiome_filter step" + label: "Error log, filtering step" doc: | stderr log generated by sc_multiome_filter step @@ -1405,7 +1118,7 @@ steps: sc_multiome_filter: doc: | Filters single-cell multiome ATAC and RNA-Seq datasets - based on the common QC metrics + based on the multiple QC metrics run: ../tools/sc-multiome-filter.cwl in: feature_bc_matrices_folder: uncompress_feature_bc_matrices/extracted_folder @@ -1417,7 +1130,7 @@ steps: blacklist_regions_file: blacklist_regions_file barcodes_data: barcodes_data rna_minimum_cells: - default: 1 + default: 1 # will remove genes that are not expressed in any of the cells minimum_genes: source: minimum_genes valueFrom: $(split_numbers(self)) @@ -1433,7 +1146,7 @@ steps: source: minimum_novelty_score valueFrom: $(split_numbers(self)) atac_minimum_cells: - default: 1 + default: 1 # will remove peaks that are not present in any of the cells atac_minimum_umi: source: atac_minimum_umi valueFrom: $(split_numbers(self)) @@ 
-1449,33 +1162,31 @@ steps: maximum_blacklist_fraction: source: maximum_blacklist_fraction valueFrom: $(split_numbers(self)) - call_by: call_by remove_doublets: source: remove_doublets - valueFrom: $(self=="none"?null:self) - rna_doublet_rate: - source: rna_doublet_rate - valueFrom: $(self==""?null:self) # safety measure - rna_doublet_rate_sd: - source: rna_doublet_rate_sd - valueFrom: $(self==""?null:self) # safety measure - atac_doublet_rate: - source: atac_doublet_rate - valueFrom: $(self==""?null:self) # safety measure - atac_doublet_rate_sd: - source: atac_doublet_rate_sd - valueFrom: $(self==""?null:self) # safety measure + valueFrom: | + ${ + if (self == "Based on either RNA or ATAC") { + return "union"; + } else if (self == "Based on RNA") { + return "onlyrna"; + } else if (self == "Based on ATAC") { + return "onlyatac"; + } else if (self == "Based on both RNA and ATAC") { + return "intersect"; + } else { + return null; + } + } verbose: default: true export_ucsc_cb: default: true color_theme: color_theme parallel_memory_limit: - source: parallel_memory_limit - valueFrom: $(parseInt(self)) + default: 32 vector_memory_limit: - source: vector_memory_limit - valueFrom: $(parseInt(self)) + default: 96 threads: source: threads valueFrom: $(parseInt(self)) @@ -1506,32 +1217,6 @@ steps: - raw_atac_umi_dnst_spl_cnd_plot_png - raw_peak_dnst_spl_cnd_plot_png - raw_blck_dnst_spl_cnd_plot_png - - mid_fltr_1_2_qc_mtrcs_pca_plot_png - - mid_fltr_2_3_qc_mtrcs_pca_plot_png - - mid_fltr_cells_count_plot_png - - mid_fltr_rna_umi_dnst_plot_png - - mid_fltr_gene_dnst_plot_png - - mid_fltr_gene_umi_corr_plot_png - - mid_fltr_mito_dnst_plot_png - - mid_fltr_nvlt_dnst_plot_png - - mid_fltr_atac_umi_dnst_plot_png - - mid_fltr_peak_dnst_plot_png - - mid_fltr_blck_dnst_plot_png - - mid_fltr_rna_atac_umi_corr_plot_png - - mid_fltr_tss_atac_umi_corr_plot_png - - mid_fltr_qc_mtrcs_dnst_plot_png - - mid_fltr_rnadbl_plot_png - - mid_fltr_atacdbl_plot_png - - mid_fltr_vrlpdbl_plot_png - - 
mid_fltr_tss_nrch_plot_png - - mid_fltr_frgm_hist_png - - mid_fltr_rna_umi_dnst_spl_cnd_plot_png - - mid_fltr_gene_dnst_spl_cnd_plot_png - - mid_fltr_mito_dnst_spl_cnd_plot_png - - mid_fltr_nvlt_dnst_spl_cnd_plot_png - - mid_fltr_atac_umi_dnst_spl_cnd_plot_png - - mid_fltr_peak_dnst_spl_cnd_plot_png - - mid_fltr_blck_dnst_spl_cnd_plot_png - fltr_1_2_qc_mtrcs_pca_plot_png - fltr_2_3_qc_mtrcs_pca_plot_png - fltr_cells_count_plot_png @@ -1558,20 +1243,12 @@ steps: - fltr_atac_umi_dnst_spl_cnd_plot_png - fltr_peak_dnst_spl_cnd_plot_png - fltr_blck_dnst_spl_cnd_plot_png - - ucsc_cb_config_data - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds - stdout_log - stderr_log - compress_cellbrowser_config_data: - run: ../tools/tar-compress.cwl - in: - folder_to_compress: sc_multiome_filter/ucsc_cb_config_data - out: - - compressed_folder - $namespaces: s: http://schema.org/ @@ -1581,7 +1258,7 @@ $schemas: label: "Single-cell Multiome ATAC and RNA-Seq Filtering Analysis" s:name: "Single-cell Multiome ATAC and RNA-Seq Filtering Analysis" -s:alternateName: "Filters single-cell multiome ATAC and RNA-Seq datasets based on the common QC metrics" +s:alternateName: "Filters single-cell multiome ATAC and RNA-Seq datasets based on the multiple QC metrics" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-multiome-filter.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -1622,4 +1299,4 @@ doc: | Single-cell Multiome ATAC and RNA-Seq Filtering Analysis Filters single-cell multiome ATAC and RNA-Seq datasets - based on the common QC metrics. \ No newline at end of file + based on the multiple QC metrics. 
\ No newline at end of file From 8ef5591b27dad9dd6e848eeee457878514995134 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 8 Jun 2023 14:31:11 -0400 Subject: [PATCH 038/162] No reasons to save compressed Cell Browser data --- workflows/sc-atac-cluster.cwl | 14 -------------- workflows/sc-ctype-assign.cwl | 14 -------------- workflows/sc-rna-cluster.cwl | 14 -------------- workflows/sc-rna-da-cells.cwl | 14 -------------- workflows/sc-rna-filter.cwl | 14 -------------- workflows/sc-triangulate.cwl | 14 -------------- workflows/sc-wnn-cluster.cwl | 14 -------------- 7 files changed, 98 deletions(-) diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index cfa6ff67..bcdb4c24 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -357,13 +357,6 @@ outputs: tab: 'Diff. peaks' Title: 'Differentially accessible peaks between each pair of clusters' - ucsc_cb_config_data: - type: File - outputSource: compress_cellbrowser_config_data/compressed_folder - label: "Compressed directory with UCSC Cellbrowser configuration data" - doc: | - Compressed directory with UCSC Cellbrowser configuration data. - ucsc_cb_html_data: type: Directory outputSource: sc_atac_cluster/ucsc_cb_html_data @@ -459,13 +452,6 @@ steps: - stdout_log - stderr_log - compress_cellbrowser_config_data: - run: ../tools/tar-compress.cwl - in: - folder_to_compress: sc_atac_cluster/ucsc_cb_config_data - out: - - compressed_folder - $namespaces: s: http://schema.org/ diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index cc45dee6..b1b60224 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -636,13 +636,6 @@ outputs: tab: 'Diff. 
peaks' Title: 'Differentially accessible peaks between each pair of cell types' - ucsc_cb_config_data: - type: File - outputSource: compress_cellbrowser_config_data/compressed_folder - label: "Compressed directory with UCSC Cellbrowser configuration data" - doc: | - Compressed directory with UCSC Cellbrowser configuration data. - ucsc_cb_html_data: type: Directory outputSource: ctype_assign/ucsc_cb_html_data @@ -764,13 +757,6 @@ steps: - stdout_log - stderr_log - compress_cellbrowser_config_data: - run: ../tools/tar-compress.cwl - in: - folder_to_compress: ctype_assign/ucsc_cb_config_data - out: - - compressed_folder - $namespaces: s: http://schema.org/ diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index 34867ada..8f740c7e 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -446,13 +446,6 @@ outputs: tab: 'Gene markers' Title: 'Differentially expressed genes between each pair of clusters' - ucsc_cb_config_data: - type: File - outputSource: compress_cellbrowser_config_data/compressed_folder - label: "Compressed directory with UCSC Cellbrowser configuration data" - doc: | - Compressed directory with UCSC Cellbrowser configuration data. 
- ucsc_cb_html_data: type: Directory outputSource: sc_rna_cluster/ucsc_cb_html_data @@ -565,13 +558,6 @@ steps: - stdout_log - stderr_log - compress_cellbrowser_config_data: - run: ../tools/tar-compress.cwl - in: - folder_to_compress: sc_rna_cluster/ucsc_cb_config_data - out: - - compressed_folder - $namespaces: s: http://schema.org/ diff --git a/workflows/sc-rna-da-cells.cwl b/workflows/sc-rna-da-cells.cwl index 753bef07..9befb120 100644 --- a/workflows/sc-rna-da-cells.cwl +++ b/workflows/sc-rna-da-cells.cwl @@ -317,13 +317,6 @@ outputs: tab: 'Per dataset' Caption: 'Split by dataset cells WNN UMAP with DA scores' - ucsc_cb_config_data: - type: File - outputSource: compress_cellbrowser_config_data/compressed_folder - label: "Compressed directory with UCSC Cellbrowser configuration data" - doc: | - Compressed directory with UCSC Cellbrowser configuration data. - ucsc_cb_html_data: type: Directory outputSource: da_cells/ucsc_cb_html_data @@ -413,13 +406,6 @@ steps: - stdout_log - stderr_log - compress_cellbrowser_config_data: - run: ../tools/tar-compress.cwl - in: - folder_to_compress: da_cells/ucsc_cb_config_data - out: - - compressed_folder - $namespaces: s: http://schema.org/ diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index b0a94dcf..682b5c02 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -581,13 +581,6 @@ outputs: tab: 'Filtered QC' Caption: 'Split by grouping condition the novelty score per cell density' - ucsc_cb_config_data: - type: File - outputSource: compress_cellbrowser_config_data/compressed_folder - label: "Compressed directory with UCSC Cellbrowser configuration data" - doc: | - Compressed directory with UCSC Cellbrowser configuration data. 
- ucsc_cb_html_data: type: Directory outputSource: sc_rna_filter/ucsc_cb_html_data @@ -721,13 +714,6 @@ steps: - stdout_log - stderr_log - compress_cellbrowser_config_data: - run: ../tools/tar-compress.cwl - in: - folder_to_compress: sc_rna_filter/ucsc_cb_config_data - out: - - compressed_folder - $namespaces: s: http://schema.org/ diff --git a/workflows/sc-triangulate.cwl b/workflows/sc-triangulate.cwl index 21b93b73..38446f97 100644 --- a/workflows/sc-triangulate.cwl +++ b/workflows/sc-triangulate.cwl @@ -256,13 +256,6 @@ outputs: tab: 'WNN' Caption: 'Cells UMAP with winning annotations' - ucsc_cb_config_data: - type: File - outputSource: compress_cellbrowser_config_data/compressed_folder - label: "Compressed directory with UCSC Cellbrowser configuration data" - doc: | - Compressed directory with UCSC Cellbrowser configuration data. - ucsc_cb_html_data: type: Directory outputSource: triangulate/ucsc_cb_html_data @@ -344,13 +337,6 @@ steps: - stdout_log - stderr_log - compress_cellbrowser_config_data: - run: ../tools/tar-compress.cwl - in: - folder_to_compress: triangulate/ucsc_cb_config_data - out: - - compressed_folder - $namespaces: s: http://schema.org/ diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index db5ffb17..1612bde6 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -589,13 +589,6 @@ outputs: tab: 'Diff. peaks' Title: 'Differentially accessible peaks between each pair of clusters' - ucsc_cb_config_data: - type: File - outputSource: compress_cellbrowser_config_data/compressed_folder - label: "Compressed directory with UCSC Cellbrowser configuration data" - doc: | - Compressed directory with UCSC Cellbrowser configuration data. 
- ucsc_cb_html_data: type: Directory outputSource: sc_wnn_cluster/ucsc_cb_html_data @@ -722,13 +715,6 @@ steps: - stdout_log - stderr_log - compress_cellbrowser_config_data: - run: ../tools/tar-compress.cwl - in: - folder_to_compress: sc_wnn_cluster/ucsc_cb_config_data - out: - - compressed_folder - $namespaces: s: http://schema.org/ From c09be239ac5523c07781050997978de74419ab22 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 28 Jun 2023 17:15:16 -0400 Subject: [PATCH 039/162] Refactor cellranger arc count pipeline --- workflows/cellranger-arc-count.cwl | 394 +++++++++++++++++------------ 1 file changed, 236 insertions(+), 158 deletions(-) diff --git a/workflows/cellranger-arc-count.cwl b/workflows/cellranger-arc-count.cwl index c75357cf..25d7e949 100644 --- a/workflows/cellranger-arc-count.cwl +++ b/workflows/cellranger-arc-count.cwl @@ -18,25 +18,36 @@ inputs: alias: type: string - label: "Experiment short name/Alias" + label: "Analysis name" sd:preview: position: 1 indices_folder: type: Directory - label: "Genome Type" - doc: "Cell Ranger ARC generated genome indices folder" + label: "Genome type" + doc: | + Reference genome package created + with cellranger-arc mkref command. 'sd:upstreamSource': "genome_indices/arc_indices_folder" 'sd:localLabel': true + memory_limit: + type: int + 'sd:upstreamSource': "genome_indices/memory_limit" + gex_fastq_file_r1: type: - File - type: array items: File format: "http://edamontology.org/format_1930" - label: "GEX FASTQ file(s) R1 (optionally compressed)" - doc: "GEX FASTQ file(s) R1 (optionally compressed)" + label: "RNA FASTQ, Read 1" + doc: | + Optionally compressed FASTQ file + with Read 1 (10x barcode and UMI) + single-cell RNA sequencing data. + If multiple files provided they + will be merged. 
gex_fastq_file_r2: type: @@ -44,8 +55,12 @@ inputs: - type: array items: File format: "http://edamontology.org/format_1930" - label: "GEX FASTQ file(s) R2 (optionally compressed)" - doc: "GEX FASTQ file(s) R2 (optionally compressed)" + label: "RNA FASTQ, Read 2" + doc: | + Optionally compressed FASTQ file + with Read 2 (cDNA insert) single-cell + RNA sequencing data. If multiple + files provided they will be merged. atac_fastq_file_r1: type: @@ -53,8 +68,13 @@ inputs: - type: array items: File format: "http://edamontology.org/format_1930" - label: "ATAC FASTQ file(s) R1 (optionally compressed)" - doc: "ATAC FASTQ file(s) R1 (optionally compressed)" + label: "ATAC FASTQ, Read 1" + doc: | + Optionally compressed FASTQ file + with Read 1 (transposed DNA) + single-cell ATAC sequencing data. + If multiple files provided they + will be merged. atac_fastq_file_r2: type: @@ -62,8 +82,13 @@ inputs: - type: array items: File format: "http://edamontology.org/format_1930" - label: "ATAC FASTQ file(s) R2 (optionally compressed)" - doc: "ATAC FASTQ file(s) R2 (optionally compressed)" + label: "ATAC FASTQ, Read 2" + doc: | + Optionally compressed FASTQ file + with Read 2 (10x barcode) + single-cell ATAC sequencing data. + If multiple files provided they + will be merged. atac_fastq_file_r3: type: @@ -71,48 +96,80 @@ inputs: - type: array items: File format: "http://edamontology.org/format_1930" - label: "ATAC FASTQ file(s) R3 (optionally compressed)" - doc: "ATAC FASTQ file(s) R3 (optionally compressed)" + label: "ATAC FASTQ, Read 3" + doc: | + Optionally compressed FASTQ file + with Read 3 (transposed DNA) + single-cell ATAC sequencing data. + If multiple files provided they + will be merged. exclude_introns: type: boolean? default: false - label: "Disable counting of intronic reads" + label: "Do not count intronic reads" doc: | - Disable counting of intronic reads. 
In this mode, only reads that are exonic - and compatible with annotated splice junctions in the reference are counted. - Note: using this mode will reduce the UMI counts in the feature-barcode matrix + Exclude intronic reads when counting + gene expression. In this mode, only + reads that are exonic and compatible + with annotated splice junctions in + the reference are counted. Using this + mode will reduce the UMI counts and + decrease sensitivity. 'sd:layout': advanced: true threads: - type: int? - default: 4 - label: "Number of threads" - doc: "Number of threads for those steps that support multithreading" + type: + - "null" + - type: enum + symbols: + - "1" + - "2" + - "3" + - "4" + default: "4" + label: "Cores/CPUs number" + doc: | + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 4 'sd:layout': advanced: true - memory_limit: - type: int? - default: 20 - label: "Genome Type" - doc: | - Maximum memory used (GB). - The same as was used for generating indices. 
- The same will be applied to virtual memory - 'sd:upstreamSource': "genome_indices/memory_limit" - 'sd:localLabel': true - outputs: + web_summary_report: + type: File + outputSource: generate_counts_matrix/web_summary_report + label: "Cell Ranger Summary" + doc: | + Report generated by Cell Ranger + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + cellbrowser_report: + type: File + outputSource: cellbrowser_build/index_html_file + label: "UCSC Cell Browser" + doc: | + UCSC Cell Browser HTML index file + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + fastqc_report_gex_fastq_r1: type: File outputSource: run_fastqc_for_gex_fastq_r1/html_file - label: "FastqQC report for GEX FASTQ file R1" + label: "QC report (RNA FASTQ, Read 1)" doc: | - FastqQC report for GEX FASTQ file R1 + FastqQC report generated for + RNA FASTQ file, Read 1 'sd:visualPlugins': - linkList: tab: 'Overview' @@ -121,9 +178,10 @@ outputs: fastqc_report_gex_fastq_r2: type: File outputSource: run_fastqc_for_gex_fastq_r2/html_file - label: "FastqQC report for GEX FASTQ file R2" + label: "QC report (RNA FASTQ, Read 2)" doc: | - FastqQC report for GEX FASTQ file R2 + FastqQC report generated for + RNA FASTQ file, Read 2 'sd:visualPlugins': - linkList: tab: 'Overview' @@ -132,9 +190,10 @@ outputs: fastqc_report_atac_fastq_r1: type: File outputSource: run_fastqc_for_atac_fastq_r1/html_file - label: "FastqQC report for ATAC FASTQ file R1" + label: "QC report (ATAC FASTQ, Read 1)" doc: | - FastqQC report for ATAC FASTQ file R1 + FastqQC report generated for + ATAC FASTQ file, Read 1 'sd:visualPlugins': - linkList: tab: 'Overview' @@ -143,9 +202,10 @@ outputs: fastqc_report_atac_fastq_r2: type: File outputSource: run_fastqc_for_atac_fastq_r2/html_file - label: "FastqQC report for ATAC FASTQ file R2" + label: "QC report (ATAC FASTQ, Read 2)" doc: | - FastqQC report for ATAC FASTQ file R2 + FastqQC report generated for + ATAC FASTQ file, Read 2 
'sd:visualPlugins': - linkList: tab: 'Overview' @@ -154,20 +214,10 @@ outputs: fastqc_report_atac_fastq_r3: type: File outputSource: run_fastqc_for_atac_fastq_r3/html_file - label: "FastqQC report for ATAC FASTQ file R3" + label: "QC report (ATAC FASTQ, Read 3)" doc: | - FastqQC report for ATAC FASTQ file R3 - 'sd:visualPlugins': - - linkList: - tab: 'Overview' - target: "_blank" - - web_summary_report: - type: File - outputSource: generate_counts_matrix/web_summary_report - label: "Cell Ranger summary" - doc: | - Cell Ranger summary + FastqQC report generated for + ATAC FASTQ file, Read 3 'sd:visualPlugins': - linkList: tab: 'Overview' @@ -176,126 +226,162 @@ outputs: metrics_summary_report: type: File outputSource: generate_counts_matrix/metrics_summary_report - label: "Run summary metrics in CSV format" + label: "Run summary metrics" doc: | - Run summary metrics in CSV format + Cell Ranger generated run summary + metrics in CSV format barcode_metrics_report: type: File outputSource: generate_counts_matrix/barcode_metrics_report - label: "ATAC and GEX barcode metrics in CSV format" + label: "ATAC and RNA barcode metrics" doc: | - ATAC and GEX read count summaries generated for every - barcode observed in the experiment. The columns contain - the paired ATAC and Gene Expression barcode sequences, - ATAC and Gene Expression QC metrics for that barcode, - as well as whether this barcode was identified as a - cell-associated partition by the pipeline. + ATAC and RNA read count summaries + generated for every barcode observed + in the experiment. The columns contain + the paired ATAC and Gene Expression + barcode sequences, ATAC and Gene + Expression QC metrics for that barcode, + as well as whether this barcode was + identified as a cell-associated + partition by the pipeline. 
gex_possorted_genome_bam_bai: type: File outputSource: generate_counts_matrix/gex_possorted_genome_bam_bai - label: "Aligned to the genome indexed reads GEX BAM+BAI files" + label: "RNA position-sorted alignments" doc: | - GEX position-sorted reads aligned to the genome and transcriptome - annotated with barcode information in BAM format + Position-sorted and indexed BAM file + of RNA read alignments to the genome + and transcriptome. Each read in this + BAM file has a 10x Chromium cellular + (associated with a 10x Genomics gel + bead) barcode and molecular barcode + information attached. atac_possorted_genome_bam_bai: type: File outputSource: generate_counts_matrix/atac_possorted_genome_bam_bai - label: "Aligned to the genome indexed reads ATAC BAM+BAI files" + label: "ATAC position-sorted alignments" doc: | - ATAC position-sorted reads aligned to the genome annotated with - barcode information in BAM format + Position-sorted and indexed BAM file + for the Chromatin Accessibility + library. Chromium cellular barcode + and mapping information for each read + is stored as TAG fields. filtered_feature_bc_matrix_folder: type: File outputSource: compress_filtered_feature_bc_matrix_folder/compressed_folder - label: "Compressed folder with filtered feature-barcode matrices" + label: "Filtered feature barcode matrix, MEX" doc: | - Filtered feature barcode matrix stored as a CSC sparse matrix in MEX format. - The rows consist of all the gene and peak features concatenated together - (identical to raw feature barcode matrix) and the columns are restricted to - those barcodes that are identified as cells. + Filtered feature barcode matrix stored + as a CSC sparse matrix in MEX format. + The rows consist of all the gene and + peak features concatenated together + (identical to raw feature barcode + matrix) and the columns are restricted + to those barcodes that are identified + as cells. 
filtered_feature_bc_matrix_h5: type: File outputSource: generate_counts_matrix/filtered_feature_bc_matrix_h5 - label: "Filtered feature-barcode matrices in HDF5 format" + label: "Filtered feature barcode matrix, HDF5" doc: | - Filtered feature barcode matrix stored as a CSC sparse matrix in hdf5 format. - The rows consist of all the gene and peak features concatenated together - (identical to raw feature barcode matrix) and the columns are restricted to - those barcodes that are identified as cells. + Filtered feature barcode matrix stored + as a CSC sparse matrix in hdf5 format. + The rows consist of all the gene and + peak features concatenated together + (identical to raw feature barcode + matrix) and the columns are restricted + to those barcodes that are identified + as cells. raw_feature_bc_matrices_folder: type: File outputSource: compress_raw_feature_bc_matrices_folder/compressed_folder - label: "Compressed folder with unfiltered feature-barcode matrices" + label: "Raw feature barcode matrix, MEX" doc: | - Raw feature barcode matrix stored as a CSC sparse matrix in MEX format. - The rows consist of all the gene and peak features concatenated together - and the columns consist of all observed barcodes with non-zero signal for + Raw feature barcode matrix stored as + a CSC sparse matrix in MEX format. + The rows consist of all the gene and + peak features concatenated together + and the columns consist of all observed + barcodes with non-zero signal for either ATAC or gene expression. raw_feature_bc_matrices_h5: type: File outputSource: generate_counts_matrix/raw_feature_bc_matrices_h5 - label: "Unfiltered feature-barcode matrices in HDF5 format" + label: "Raw feature barcode matrix, HDF5" doc: | - Raw feature barcode matrix stored as a CSC sparse matrix in hdf5 format. 
- The rows consist of all the gene and peak features concatenated together - and the columns consist of all observed barcodes with non-zero signal for + Raw feature barcode matrix stored as + a CSC sparse matrix in hdf5 format. + The rows consist of all the gene and + peak features concatenated together + and the columns consist of all observed + barcodes with non-zero signal for either ATAC or gene expression. secondary_analysis_report_folder: type: File outputSource: compress_secondary_analysis_report_folder/compressed_folder - label: "Compressed folder with secondary analysis results" + label: "Secondary analysis" doc: | - Various secondary analyses that utilize the ATAC data, the GEX data, and their - linkage: dimensionality reduction and clustering results for the ATAC and GEX - data, differential expression, and differential accessibility for all clustering - results above and linkage between ATAC and GEX data. + Various secondary analyses that + utilize the ATAC, RNA data, and + their linkage: dimensionality + reduction and clustering results + for the ATAC and RNA data, + differential expression, and + differential accessibility for all + clustering results above and linkage + between ATAC and RNA data. 
gex_molecule_info_h5: type: File outputSource: generate_counts_matrix/gex_molecule_info_h5 - label: "GEX molecule-level information for aggregating samples into larger datasets" + label: "RNA molecule-level data" doc: | - Count and barcode information for every GEX molecule observed in the experiment - in hdf5 format + Count and barcode information for + every RNA molecule observed in the + experiment in hdf5 format loupe_browser_track: type: File outputSource: generate_counts_matrix/loupe_browser_track - label: "Loupe Browser visualization file with all the analysis outputs" + label: "Loupe Browser visualization" doc: | - Loupe Browser visualization file with all the analysis outputs + Loupe Browser visualization file + with all the analysis outputs atac_fragments_file: type: File outputSource: generate_counts_matrix/atac_fragments_file - label: "Count and barcode information for every ATAC fragment in TSV format" + label: "ATAC fragments" doc: | - Count and barcode information for every ATAC fragment observed in + Count and barcode information for + every ATAC fragment observed in the experiment in TSV format. atac_peaks_bed_file: type: File outputSource: generate_counts_matrix/atac_peaks_bed_file - label: "Identified peaks in BED format" + label: "ATAC peaks" doc: | - Locations of open-chromatin regions identified in this sample. - These regions are referred to as "peaks". + Locations of open-chromatin regions + identified in this sample. These + regions are referred to as "peaks". atac_cut_sites_bigwig_file: type: File outputSource: generate_counts_matrix/atac_cut_sites_bigwig_file - label: "Observed transposition sites in bigWig format" + label: "ATAC cut sites" doc: | - Genome track of observed transposition sites in the experiment - smoothed at a resolution of 400 bases in BIGWIG format. + Genome track of observed transposition + sites in the experiment smoothed at a + resolution of 400 bases in bigWig format. 
'sd:visualPlugins': - igvbrowser: tab: 'IGV Genome Browser' @@ -307,37 +393,41 @@ outputs: atac_peak_annotation_file: type: File outputSource: generate_counts_matrix/atac_peak_annotation_file - label: "Annotations of peaks based on genomic proximity in TSV format" + label: "ATAC peaks annotations" doc: | - Annotations of peaks based on genomic proximity alone. - Note that these are not functional annotations and they - do not make use of linkage with GEX data. + Annotations of peaks based on + genomic proximity alone. Note, + that these are not functional + annotations and they do not make + use of linkage with RNA data. generate_counts_matrix_stdout_log: type: File outputSource: generate_counts_matrix/stdout_log - label: stdout log generated by cellranger-arc count + label: "Output log, cellranger-arc count step" doc: | stdout log generated by cellranger-arc count generate_counts_matrix_stderr_log: type: File outputSource: generate_counts_matrix/stderr_log - label: stderr log generated by cellranger-arc count + label: "Error log, cellranger-arc count step" doc: | stderr log generated by cellranger-arc count collected_statistics_yaml: type: File outputSource: collect_statistics/collected_statistics_yaml - label: "Collected statistics in YAML format" - doc: "Collected statistics in YAML format" + label: "Collected statistics, YAML" + doc: | + Collected statistics in YAML format collected_statistics_md: type: File outputSource: collect_statistics/collected_statistics_md - label: "Collected statistics in Markdown format" - doc: "Collected statistics in Markdown format" + label: "Collected statistics" + doc: | + Collected statistics in Markdown format 'sd:visualPlugins': - markdownView: tab: 'Overview' @@ -345,37 +435,21 @@ outputs: collected_statistics_tsv: type: File outputSource: collect_statistics/collected_statistics_tsv - label: "Collected statistics in TSV format" - doc: "Collected statistics in TSV format" + label: "Collected statistics" + doc: | + Collected 
statistics in TSV format 'sd:visualPlugins': - tableView: vertical: true tab: 'Overview' - compressed_html_data_folder: - type: File - outputSource: compress_html_data_folder/compressed_folder - label: "Compressed folder with CellBrowser formatted results" - doc: | - Compressed folder with CellBrowser formatted results - html_data_folder: type: Directory outputSource: cellbrowser_build/html_data - label: "Folder with not compressed CellBrowser formatted results" + label: "UCSC Cell Browser data" doc: | - Folder with not compressed CellBrowser formatted results - - cellbrowser_report: - type: File - outputSource: cellbrowser_build/index_html_file - label: "CellBrowser formatted Cellranger report" - doc: | - CellBrowser formatted Cellranger report - 'sd:visualPlugins': - - linkList: - tab: 'Overview' - target: "_blank" + Directory with UCSC Cell Browser + data steps: @@ -385,7 +459,7 @@ steps: in: compressed_file: gex_fastq_file_r1 output_prefix: - default: "gex_read_1" + default: "rna_read_1" out: - fastq_file @@ -394,7 +468,7 @@ steps: in: compressed_file: gex_fastq_file_r2 output_prefix: - default: "gex_read_2" + default: "rna_read_2" out: - fastq_file @@ -425,12 +499,13 @@ steps: out: - fastq_file - run_fastqc_for_gex_fastq_r1: run: ../tools/fastqc.cwl in: reads_file: extract_gex_fastq_r1/fastq_file - threads: threads + threads: + source: threads + valueFrom: $(parseInt(self)) out: - html_file @@ -438,7 +513,9 @@ steps: run: ../tools/fastqc.cwl in: reads_file: extract_gex_fastq_r2/fastq_file - threads: threads + threads: + source: threads + valueFrom: $(parseInt(self)) out: - html_file @@ -446,7 +523,9 @@ steps: run: ../tools/fastqc.cwl in: reads_file: extract_atac_fastq_r1/fastq_file - threads: threads + threads: + source: threads + valueFrom: $(parseInt(self)) out: - html_file @@ -454,7 +533,9 @@ steps: run: ../tools/fastqc.cwl in: reads_file: extract_atac_fastq_r2/fastq_file - threads: threads + threads: + source: threads + valueFrom: $(parseInt(self)) out: - 
html_file @@ -462,11 +543,12 @@ steps: run: ../tools/fastqc.cwl in: reads_file: extract_atac_fastq_r3/fastq_file - threads: threads + threads: + source: threads + valueFrom: $(parseInt(self)) out: - html_file - generate_counts_matrix: run: ../tools/cellranger-arc-count.cwl in: @@ -477,7 +559,9 @@ steps: atac_fastq_file_r3: extract_atac_fastq_r3/fastq_file indices_folder: indices_folder exclude_introns: exclude_introns - threads: threads + threads: + source: threads + valueFrom: $(parseInt(self)) memory_limit: memory_limit virt_memory_limit: memory_limit out: @@ -539,13 +623,6 @@ steps: - html_data - index_html_file - compress_html_data_folder: - run: ../tools/tar-compress.cwl - in: - folder_to_compress: cellbrowser_build/html_data - out: - - compressed_folder - $namespaces: s: http://schema.org/ @@ -553,9 +630,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -s:name: "Cell Ranger ARC Count Gene Expression + ATAC" -label: "Cell Ranger ARC Count Gene Expression + ATAC" -s:alternateName: "Counts ATAC and gene expression reads from a single 10x Genomics Cell Ranger Multiome ATAC + Gene Expression library" +s:name: "Cell Ranger ARC Count Gene Expression and Chromatin Accessibility" +label: "Cell Ranger ARC Count Gene Expression and Chromatin Accessibility" +s:alternateName: "Counts gene expression and chromatin accessibility for a single library" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/cellranger-arc-count.cwl s:codeRepository: https://github.com/datirium/workflows @@ -593,5 +670,6 @@ s:creator: doc: | - Cell Ranger ARC Count Gene Expression + ATAC - ============================================ \ No newline at end of file + Cell Ranger ARC Count Gene Expression and Chromatin Accessibility + + Counts gene expression and chromatin accessibility for a single library \ No newline at end of file From 80b7ff27c58cc1f8cae70dc62c5e622afe06f3d4 Mon Sep 17 
00:00:00 2001 From: Michael Kotliar Date: Wed, 28 Jun 2023 18:26:36 -0400 Subject: [PATCH 040/162] Refactor cellranger arc aggr workflow --- workflows/cellranger-arc-aggr.cwl | 272 ++++++++++++++++++------------ 1 file changed, 161 insertions(+), 111 deletions(-) diff --git a/workflows/cellranger-arc-aggr.cwl b/workflows/cellranger-arc-aggr.cwl index 86eccdcb..2daaf442 100644 --- a/workflows/cellranger-arc-aggr.cwl +++ b/workflows/cellranger-arc-aggr.cwl @@ -20,68 +20,87 @@ inputs: alias: type: string - label: "Experiment short name/Alias" + label: "Analysis name" sd:preview: position: 1 - gem_well_labels: - type: string[] - label: "scRNA-Seq Cell Ranger ARC Experiment" - doc: "Array of GEM well identifiers to be used for labeling purposes only" - 'sd:upstreamSource': "sc_rnaseq_sample/alias" - 'sd:localLabel': true - gex_molecule_info_h5: type: File[] - label: "scRNA-Seq Cell Ranger ARC Experiment" - doc: "Molecule-level information from individual runs of cellranger-arc count" + label: "Cell Ranger ARC Sample" + doc: | + Any "Cell Ranger ARC Sample" that + produces RNA molecule-level data, + ATAC fragments, and ATAC and RNA + barcode metrics files. 
'sd:upstreamSource': "sc_rnaseq_sample/gex_molecule_info_h5" + 'sd:localLabel': true + + gem_well_labels: + type: string[] + 'sd:upstreamSource': "sc_rnaseq_sample/alias" atac_fragments_file_from_count: type: File[] secondaryFiles: - .tbi - label: "scRNA-Seq Cell Ranger ARC Experiment" - doc: "Count and barcode information from individual runs of cellranger-arc count" 'sd:upstreamSource': "sc_rnaseq_sample/atac_fragments_file" barcode_metrics_report: type: File[] - label: "scRNA-Seq Cell Ranger ARC Experiment" - doc: "ATAC and GEX barcode metrics from individual runs of cellranger-arc count" 'sd:upstreamSource': "sc_rnaseq_sample/barcode_metrics_report" indices_folder: type: Directory - label: "Genome Type" - doc: "Cell Ranger ARC generated genome indices folder" + label: "Genome type" + doc: | + Reference genome package created + with cellranger-arc mkref command. 'sd:upstreamSource': "genome_indices/arc_indices_folder" 'sd:localLabel': true + memory_limit: + type: int + 'sd:upstreamSource': "genome_indices/memory_limit" + normalization_mode: type: - "null" - type: enum - symbols: ["none", "depth"] + symbols: + - "none" + - "depth" default: "none" - label: "Library depth normalization mode" - doc: "Library depth normalization mode" + label: "Library depth normalization" + doc: | + When "depth" normalization is + selected, subsample reads from + higher-depth GEM wells until we + equalize the 1) median number + of unique fragments per cell for + each ATAC library, 2) mean number + of reads that are confidently + mapped to the transcriptome per + cell for each gene expression + library. 'sd:layout': advanced: true threads: - type: int? - default: 4 - label: "Number of threads" - doc: "Number of threads for those steps that support multithreading" - 'sd:layout': - advanced: true - - memory_limit: - type: int? - default: 20 - label: "Maximum memory used (GB)" - doc: "Maximum memory used (GB). 
The same will be applied to virtual memory" + type: + - "null" + - type: enum + symbols: + - "1" + - "2" + - "3" + - "4" + default: "4" + label: "Cores/CPUs number" + doc: | + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 4 'sd:layout': advanced: true @@ -91,134 +110,168 @@ outputs: web_summary_report: type: File outputSource: aggregate_counts/web_summary_report - label: "Aggregated run summary metrics and charts in HTML format" + label: "Cell Ranger Summary" doc: | - Aggregated run summary metrics and charts in HTML format + Report generated by Cell Ranger 'sd:visualPlugins': - linkList: tab: 'Overview' target: "_blank" - metrics_summary_report: - type: File - outputSource: aggregate_counts/metrics_summary_report - label: "Aggregated run summary metrics in CSV format" - doc: | - Aggregated run summary metrics in CSV format - - atac_fragments_file: - type: File - outputSource: aggregate_counts/atac_fragments_file - label: "Aggregated count and barcode information" - doc: | - Count and barcode information for every ATAC fragment observed in the - aggregated experiment in TSV format - - atac_peaks_bed_file: + cellbrowser_report: type: File - outputSource: aggregate_counts/atac_peaks_bed_file - label: "Locations of open-chromatin regions identified in aggregated experiment" + outputSource: cellbrowser_build/index_html_file + label: "UCSC Cell Browser" doc: | - Locations of open-chromatin regions identified in aggregated experiment - (these regions are referred to as "peaks") + UCSC Cell Browser HTML index file + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" - atac_peak_annotation_file: + metrics_summary_report: type: File - outputSource: aggregate_counts/atac_peak_annotation_file - label: "Annotations of peaks based on genomic proximity alone for aggregated experiment" + outputSource: aggregate_counts/metrics_summary_report + label: "Run summary metrics" doc: | - Annotations of 
peaks based on genomic proximity alone (for aggregated - experiment). Note that these are not functional annotations and they - do not make use of linkage with GEX data. + Cell Ranger generated run summary + metrics in CSV format - secondary_analysis_report_folder: + aggregation_metadata: type: File - outputSource: compress_secondary_analysis_report_folder/compressed_folder - label: "Compressed folder with aggregated secondary analysis results" + outputSource: aggregate_counts/aggregation_metadata + label: "Aggregation metadata" doc: | - Compressed folder with secondary analysis results including dimensionality reduction, - cell clustering, and differential expression of aggregated results + Aggregation metadata file + in CSV format filtered_feature_bc_matrix_folder: type: File outputSource: compress_filtered_feature_bc_matrix_folder/compressed_folder - label: "Compressed folder with aggregated filtered feature-barcode matrices" + label: "Filtered feature barcode matrix, MEX" doc: | - Compressed folder with aggregated filtered feature-barcode matrices containing only cellular barcodes in MEX format + Filtered feature barcode matrix stored + as a CSC sparse matrix in MEX format. + The rows consist of all the gene and + peak features concatenated together + (identical to raw feature barcode + matrix) and the columns are restricted + to those barcodes that are identified + as cells. filtered_feature_bc_matrix_h5: type: File outputSource: aggregate_counts/filtered_feature_bc_matrix_h5 - label: "Aggregated filtered feature-barcode matrices in HDF5 format" + label: "Filtered feature barcode matrix, HDF5" doc: | - Aggregated filtered feature-barcode matrices containing only cellular barcodes in HDF5 format - + Filtered feature barcode matrix stored + as a CSC sparse matrix in hdf5 format. 
+ The rows consist of all the gene and + peak features concatenated together + (identical to raw feature barcode + matrix) and the columns are restricted + to those barcodes that are identified + as cells. + raw_feature_bc_matrices_folder: type: File outputSource: compress_raw_feature_bc_matrices_folder/compressed_folder - label: "Compressed folder with aggregated unfiltered feature-barcode matrices" + label: "Raw feature barcode matrix, MEX" doc: | - Compressed folder with aggregated unfiltered feature-barcode matrices containing all barcodes in MEX format + Raw feature barcode matrix stored as + a CSC sparse matrix in MEX format. + The rows consist of all the gene and + peak features concatenated together + and the columns consist of all observed + barcodes with non-zero signal for + either ATAC or gene expression. raw_feature_bc_matrices_h5: type: File outputSource: aggregate_counts/raw_feature_bc_matrices_h5 - label: "Aggregated unfiltered feature-barcode matrices in HDF5 format" + label: "Raw feature barcode matrix, HDF5" + doc: | + Raw feature barcode matrix stored as + a CSC sparse matrix in hdf5 format. + The rows consist of all the gene and + peak features concatenated together + and the columns consist of all observed + barcodes with non-zero signal for + either ATAC or gene expression. + + secondary_analysis_report_folder: + type: File + outputSource: compress_secondary_analysis_report_folder/compressed_folder + label: "Secondary analysis" doc: | - Aggregated unfiltered feature-barcode matrices containing all barcodes in HDF5 format + Various secondary analyses that + utilize the ATAC, RNA data, and + their linkage: dimensionality + reduction and clustering results + for the ATAC and RNA data, + differential expression, and + differential accessibility for all + clustering results above and linkage + between ATAC and RNA data. 
loupe_browser_track: type: File outputSource: aggregate_counts/loupe_browser_track - label: "Loupe Browser visualization and analysis file for aggregated results" + label: "Loupe Browser visualization" doc: | - Loupe Browser visualization and analysis file for aggregated results + Loupe Browser visualization file + with all the analysis outputs - aggregation_metadata: + atac_fragments_file: type: File - outputSource: aggregate_counts/aggregation_metadata - label: "Aggregation metadata in CSV format" + outputSource: aggregate_counts/atac_fragments_file + label: "ATAC fragments" + doc: | + Count and barcode information for + every ATAC fragment observed in + the experiment in TSV format. + + atac_peaks_bed_file: + type: File + outputSource: aggregate_counts/atac_peaks_bed_file + label: "ATAC peaks" + doc: | + Locations of open-chromatin regions + identified in this sample. These + regions are referred to as "peaks". + + atac_peak_annotation_file: + type: File + outputSource: aggregate_counts/atac_peak_annotation_file + label: "ATAC peaks annotations" doc: | - Aggregation metadata in CSV format + Annotations of peaks based on + genomic proximity alone. Note, + that these are not functional + annotations and they do not make + use of linkage with RNA data. 
aggregate_counts_stdout_log: type: File outputSource: aggregate_counts/stdout_log - label: "stdout log generated by cellranger-arc aggr" + label: "Output log, cellranger-arc aggr step" doc: | stdout log generated by cellranger-arc aggr aggregate_counts_stderr_log: type: File outputSource: aggregate_counts/stderr_log - label: "stderr log generated by cellranger-arc aggr" + label: "Error log, cellranger-arc aggr step" doc: | stderr log generated by cellranger-arc aggr - compressed_html_data_folder: - type: File - outputSource: compress_html_data_folder/compressed_folder - label: "Compressed folder with CellBrowser formatted results" - doc: | - Compressed folder with CellBrowser formatted results - html_data_folder: type: Directory outputSource: cellbrowser_build/html_data - label: "Folder with not compressed CellBrowser formatted results" - doc: | - Folder with not compressed CellBrowser formatted results - - cellbrowser_report: - type: File - outputSource: cellbrowser_build/index_html_file - label: "CellBrowser formatted Cellranger report" + label: "UCSC Cell Browser data" doc: | - CellBrowser formatted Cellranger report - 'sd:visualPlugins': - - linkList: - tab: 'Overview' - target: "_blank" + Directory with UCSC Cell Browser + data steps: @@ -232,7 +285,9 @@ steps: gem_well_labels: gem_well_labels indices_folder: indices_folder normalization_mode: normalization_mode - threads: threads + threads: + source: threads + valueFrom: $(parseInt(self)) memory_limit: memory_limit virt_memory_limit: memory_limit out: @@ -282,13 +337,6 @@ steps: - html_data - index_html_file - compress_html_data_folder: - run: ../tools/tar-compress.cwl - in: - folder_to_compress: cellbrowser_build/html_data - out: - - compressed_folder - $namespaces: s: http://schema.org/ @@ -299,7 +347,7 @@ $schemas: label: "Cell Ranger ARC Aggregate" s:name: "Cell Ranger ARC Aggregate" -s:alternateName: "Aggregates data from multiple Cell Ranger ARC Count Gene Expression + ATAC experiments" 
+s:alternateName: "Aggregates data from multiple Cell Ranger ARC Count Gene Expression and Chromatin Accessibility experiments" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/cellranger-arc-aggr.cwl s:codeRepository: https://github.com/datirium/workflows @@ -338,4 +386,6 @@ s:creator: doc: | Cell Ranger ARC Aggregate - ========================= + + Aggregates data from multiple Cell Ranger ARC Count Gene + Expression and Chromatin Accessibility experiments. From d5fb9f3d126e0f598283cd7c1a95747d2251bff5 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 30 Jun 2023 16:25:39 -0400 Subject: [PATCH 041/162] Update sc tools to use the latest docker image --- tools/sc-atac-cluster.cwl | 2 +- tools/sc-atac-coverage.cwl | 2 +- tools/sc-atac-dbinding.cwl | 2 +- tools/sc-atac-reduce.cwl | 14 +- tools/sc-ctype-assign.cwl | 2 +- tools/sc-multiome-filter.cwl | 347 +++++++++++++++---------------- tools/sc-rna-cluster.cwl | 2 +- tools/sc-rna-da-cells.cwl | 2 +- tools/sc-rna-de-pseudobulk.cwl | 2 +- tools/sc-rna-filter.cwl | 73 +++---- tools/sc-rna-reduce.cwl | 10 +- tools/sc-triangulate.cwl | 2 +- tools/sc-wnn-cluster.cwl | 2 +- workflows/sc-atac-reduce.cwl | 12 +- workflows/sc-multiome-filter.cwl | 206 +++++++++--------- workflows/sc-rna-filter.cwl | 54 ++--- workflows/sc-rna-reduce.cwl | 12 +- 17 files changed, 371 insertions(+), 375 deletions(-) diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index 13e2aaba..292b2965 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.22 + dockerPull: biowardrobe2/sc-tools:v0.0.23 inputs: diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index 87bcf337..e4e4ea8a 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: 
biowardrobe2/sc-tools:v0.0.22 + dockerPull: biowardrobe2/sc-tools:v0.0.23 inputs: diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index 5599ddf8..b1941b2d 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.22 + dockerPull: biowardrobe2/sc-tools:v0.0.23 inputs: diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index bca8d6f3..663d445e 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.22 + dockerPull: biowardrobe2/sc-tools:v0.0.23 inputs: @@ -383,20 +383,20 @@ outputs: Split by grouping condition cells UMAP. PDF format - umap_spl_umi_plot_png: + umap_spl_frgm_plot_png: type: File? outputBinding: - glob: "*_umap_spl_umi.png" + glob: "*_umap_spl_frgm.png" doc: | - Split by the UMI per cell counts cells UMAP. + Split by the fragments in peaks per cell counts cells UMAP. PNG format - umap_spl_umi_plot_pdf: + umap_spl_frgm_plot_pdf: type: File? outputBinding: - glob: "*_umap_spl_umi.pdf" + glob: "*_umap_spl_frgm.pdf" doc: | - Split by the UMI per cell counts cells UMAP. + Split by the fragments in peaks per cell counts cells UMAP. 
PDF format umap_spl_peak_plot_png: diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index 1df1ec99..717cf4fc 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.22 + dockerPull: biowardrobe2/sc-tools:v0.0.23 inputs: diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index 15672a5d..e83b8945 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.22 + dockerPull: biowardrobe2/sc-tools:v0.0.23 inputs: @@ -150,17 +150,17 @@ inputs: the '--mex' input based on the '--identity' file. Default: 5000 (applied to all datasets) - rna_minimum_umi: + minimum_umis: type: - "null" - int - int[] inputBinding: - prefix: "--rnaminumi" + prefix: "--minumis" doc: | - Include cells where at least this many UMI (RNA transcripts) are detected. - If multiple values provided, each of them will be applied to the correspondent - dataset from the '--mex' input based on the '--identity' file. + Include cells where at least this many UMI (transcripts) are detected. + If multiple values provided, each of them will be applied to the + correspondent dataset from the '--mex' input based on the '--identity' file. Default: 500 (applied to all datasets) mito_pattern: @@ -202,17 +202,18 @@ inputs: Include only peaks detected in at least this many cells. Default: 5 (applied to all datasets) - atac_minimum_umi: + minimum_fragments: type: - "null" - int - int[] inputBinding: - prefix: "--atacminumi" + prefix: "--minfragments" doc: | - Include cells where at least this many UMI (ATAC transcripts) are detected. - If multiple values provided, each of them will be applied to the correspondent - dataset from the '--mex' input based on the '--identity' file. 
+ Include cells where at least this many fragments in peaks are + detected. If multiple values provided, each of them will be + applied to the correspondent dataset from the '--mex' input + based on the '--identity' file. Default: 1000 (applied to all datasets) maximum_nucl_signal: @@ -484,20 +485,20 @@ outputs: Number of cells per dataset (not filtered). PDF format - raw_rna_umi_dnst_plot_png: + raw_umi_dnst_plot_png: type: File? outputBinding: - glob: "*_raw_rna_umi_dnst.png" + glob: "*_raw_umi_dnst.png" doc: | - UMI per cell density for RNA assay (not filtered). + Transcripts per cell density (not filtered). PNG format - raw_rna_umi_dnst_plot_pdf: + raw_umi_dnst_plot_pdf: type: File? outputBinding: - glob: "*_raw_rna_umi_dnst.pdf" + glob: "*_raw_umi_dnst.pdf" doc: | - UMI per cell density for RNA assay (not filtered). + Transcripts per cell density (not filtered). PDF format raw_gene_dnst_plot_png: @@ -516,20 +517,20 @@ outputs: Genes per cell density (not filtered). PDF format - raw_gene_umi_corr_plot_png: + raw_gene_umi_plot_png: type: File? outputBinding: - glob: "*_raw_gene_umi_corr.png" + glob: "*_raw_gene_umi.png" doc: | - Genes vs UMI per cell correlation for RNA assay (not filtered). + Genes vs transcripts per cell (not filtered). PNG format - raw_gene_umi_corr_plot_pdf: + raw_gene_umi_plot_pdf: type: File? outputBinding: - glob: "*_raw_gene_umi_corr.pdf" + glob: "*_raw_gene_umi.pdf" doc: | - Genes vs UMI per cell correlation for RNA assay (not filtered). + Genes vs transcripts per cell (not filtered). PDF format raw_mito_dnst_plot_png: @@ -564,20 +565,20 @@ outputs: Novelty score per cell density for RNA assay (not filtered). PDF format - raw_atac_umi_dnst_plot_png: + raw_frgm_dnst_plot_png: type: File? outputBinding: - glob: "*_raw_atac_umi_dnst.png" + glob: "*_raw_frgm_dnst.png" doc: | - UMI per cell density for ATAC assay (not filtered). + Fragments in peaks per cell density (not filtered). 
PNG format - raw_atac_umi_dnst_plot_pdf: + raw_frgm_dnst_plot_pdf: type: File? outputBinding: - glob: "*_raw_atac_umi_dnst.pdf" + glob: "*_raw_frgm_dnst.pdf" doc: | - UMI per cell density for ATAC assay (not filtered). + Fragments in peaks per cell density (not filtered). PDF format raw_peak_dnst_plot_png: @@ -612,36 +613,36 @@ outputs: Fraction of ATAC fragments within genomic blacklist regions per cell density (not filtered). PDF format - raw_rna_atac_umi_corr_plot_png: + raw_rna_atac_cnts_plot_png: type: File? outputBinding: - glob: "*_raw_rna_atac_umi_corr.png" + glob: "*_raw_rna_atac_cnts.png" doc: | - UMI per cell correlation for RNA vs ATAC assays (not filtered). + Transcripts vs fragments in peaks per cell (not filtered). PNG format - raw_rna_atac_umi_corr_plot_pdf: + raw_rna_atac_cnts_plot_pdf: type: File? outputBinding: - glob: "*_raw_rna_atac_umi_corr.pdf" + glob: "*_raw_rna_atac_cnts.pdf" doc: | - UMI per cell correlation for RNA vs ATAC assays (not filtered). + Transcripts vs fragments in peaks per cell (not filtered). PDF format - raw_tss_atac_umi_corr_plot_png: + raw_tss_frgm_plot_png: type: File? outputBinding: - glob: "*_raw_tss_atac_umi_corr.png" + glob: "*_raw_tss_frgm.png" doc: | - TSS enrichment score vs UMI per cell correlation for ATAC assay (not filtered). + TSS enrichment score vs fragments in peaks per cell (not filtered). PNG format - raw_tss_atac_umi_corr_plot_pdf: + raw_tss_frgm_plot_pdf: type: File? outputBinding: - glob: "*_raw_tss_atac_umi_corr.pdf" + glob: "*_raw_tss_frgm.pdf" doc: | - TSS enrichment score vs UMI per cell correlation for ATAC assay (not filtered). + TSS enrichment score vs fragments in peaks per cell (not filtered). PDF format raw_qc_mtrcs_dnst_plot_png: @@ -740,20 +741,20 @@ outputs: Fragments length histogram (not filtered). PDF format - raw_rna_umi_dnst_spl_cnd_plot_png: + raw_umi_dnst_spl_cnd_plot_png: type: File? 
outputBinding: - glob: "*_raw_rna_umi_dnst_spl_cnd.png" + glob: "*_raw_umi_dnst_spl_cnd.png" doc: | - Split by grouping condition UMI per cell density for RNA assay (not filtered). + Split by grouping condition transcripts per cell density (not filtered). PNG format - raw_rna_umi_dnst_spl_cnd_plot_pdf: + raw_umi_dnst_spl_cnd_plot_pdf: type: File? outputBinding: - glob: "*_raw_rna_umi_dnst_spl_cnd.pdf" + glob: "*_raw_umi_dnst_spl_cnd.pdf" doc: | - Split by grouping condition UMI per cell density for RNA assay (not filtered). + Split by grouping condition transcripts per cell density (not filtered). PDF format raw_gene_dnst_spl_cnd_plot_png: @@ -806,20 +807,20 @@ outputs: Split by grouping condition the novelty score per cell density for RNA assay (not filtered). PDF format - raw_atac_umi_dnst_spl_cnd_plot_png: + raw_frgm_dnst_spl_cnd_plot_png: type: File? outputBinding: - glob: "*_raw_atac_umi_dnst_spl_cnd.png" + glob: "*_raw_frgm_dnst_spl_cnd.png" doc: | - Split by grouping condition UMI per cell density for ATAC assay (not filtered). + Split by grouping condition fragments in peaks per cell density (not filtered). PNG format - raw_atac_umi_dnst_spl_cnd_plot_pdf: + raw_frgm_dnst_spl_cnd_plot_pdf: type: File? outputBinding: - glob: "*_raw_atac_umi_dnst_spl_cnd.pdf" + glob: "*_raw_frgm_dnst_spl_cnd.pdf" doc: | - Split by grouping condition UMI per cell density for ATAC assay (not filtered). + Split by grouping condition fragments in peaks per cell density (not filtered). PDF format raw_peak_dnst_spl_cnd_plot_png: @@ -904,20 +905,20 @@ outputs: Number of cells per dataset (intermediate filtered). PDF format - mid_fltr_rna_umi_dnst_plot_png: + mid_fltr_umi_dnst_plot_png: type: File? outputBinding: - glob: "*_mid_fltr_rna_umi_dnst.png" + glob: "*_mid_fltr_umi_dnst.png" doc: | - UMI per cell density for RNA assay (intermediate filtered). + Transcripts per cell density (intermediate filtered). 
PNG format - mid_fltr_rna_umi_dnst_plot_pdf: + mid_fltr_umi_dnst_plot_pdf: type: File? outputBinding: - glob: "*_mid_fltr_rna_umi_dnst.pdf" + glob: "*_mid_fltr_umi_dnst.pdf" doc: | - UMI per cell density for RNA assay (intermediate filtered). + Transcripts per cell density (intermediate filtered). PDF format mid_fltr_gene_dnst_plot_png: @@ -936,20 +937,20 @@ outputs: Genes per cell density (intermediate filtered). PDF format - mid_fltr_gene_umi_corr_plot_png: + mid_fltr_gene_umi_plot_png: type: File? outputBinding: - glob: "*_mid_fltr_gene_umi_corr.png" + glob: "*_mid_fltr_gene_umi.png" doc: | - Genes vs UMI per cell correlation for RNA assay (intermediate filtered). + Genes vs transcripts per cell (intermediate filtered). PNG format - mid_fltr_gene_umi_corr_plot_pdf: + mid_fltr_gene_umi_plot_pdf: type: File? outputBinding: - glob: "*_mid_fltr_gene_umi_corr.pdf" + glob: "*_mid_fltr_gene_umi.pdf" doc: | - Genes vs UMI per cell correlation for RNA assay (intermediate filtered). + Genes vs transcripts per cell (intermediate filtered). PDF format mid_fltr_mito_dnst_plot_png: @@ -984,20 +985,20 @@ outputs: Novelty score per cell density for RNA assay (intermediate filtered). PDF format - mid_fltr_atac_umi_dnst_plot_png: + mid_fltr_frgm_dnst_plot_png: type: File? outputBinding: - glob: "*_mid_fltr_atac_umi_dnst.png" + glob: "*_mid_fltr_frgm_dnst.png" doc: | - UMI per cell density for ATAC assay (intermediate filtered). + Fragments in peaks per cell density (intermediate filtered). PNG format - mid_fltr_atac_umi_dnst_plot_pdf: + mid_fltr_frgm_dnst_plot_pdf: type: File? outputBinding: - glob: "*_mid_fltr_atac_umi_dnst.pdf" + glob: "*_mid_fltr_frgm_dnst.pdf" doc: | - UMI per cell density for ATAC assay (intermediate filtered). + Fragments in peaks per cell density (intermediate filtered). PDF format mid_fltr_peak_dnst_plot_png: @@ -1032,36 +1033,36 @@ outputs: Fraction of ATAC fragments within genomic blacklist regions per cell density (intermediate filtered). 
PDF format - mid_fltr_rna_atac_umi_corr_plot_png: + mid_fltr_rna_atac_cnts_plot_png: type: File? outputBinding: - glob: "*_mid_fltr_rna_atac_umi_corr.png" + glob: "*_mid_fltr_rna_atac_cnts.png" doc: | - UMI per cell correlation for RNA vs ATAC assays (intermediate filtered). + Transcripts vs fragments in peaks per cell (intermediate filtered). PNG format - mid_fltr_rna_atac_umi_corr_plot_pdf: + mid_fltr_rna_atac_cnts_plot_pdf: type: File? outputBinding: - glob: "*_mid_fltr_rna_atac_umi_corr.pdf" + glob: "*_mid_fltr_rna_atac_cnts.pdf" doc: | - UMI per cell correlation for RNA vs ATAC assays (intermediate filtered). + Transcripts vs fragments in peaks per cell (intermediate filtered). PDF format - mid_fltr_tss_atac_umi_corr_plot_png: + mid_fltr_tss_frgm_plot_png: type: File? outputBinding: - glob: "*_mid_fltr_tss_atac_umi_corr.png" + glob: "*_mid_fltr_tss_frgm.png" doc: | - TSS enrichment score vs UMI per cell correlation for ATAC assay (intermediate filtered). + TSS enrichment score vs fragments in peaks per cell (intermediate filtered). PNG format - mid_fltr_tss_atac_umi_corr_plot_pdf: + mid_fltr_tss_frgm_plot_pdf: type: File? outputBinding: - glob: "*_mid_fltr_tss_atac_umi_corr.pdf" + glob: "*_mid_fltr_tss_frgm.pdf" doc: | - TSS enrichment score vs UMI per cell correlation for ATAC assay (intermediate filtered). + TSS enrichment score vs fragments in peaks per cell (intermediate filtered). PDF format mid_fltr_qc_mtrcs_dnst_plot_png: @@ -1160,20 +1161,20 @@ outputs: Fragments length histogram (intermediate filtered). PDF format - mid_fltr_rna_umi_dnst_spl_cnd_plot_png: + mid_fltr_umi_dnst_spl_cnd_plot_png: type: File? outputBinding: - glob: "*_mid_fltr_rna_umi_dnst_spl_cnd.png" + glob: "*_mid_fltr_umi_dnst_spl_cnd.png" doc: | - Split by grouping condition UMI per cell density for RNA assay (intermediate filtered). + Split by grouping condition transcripts per cell density (intermediate filtered). 
PNG format - mid_fltr_rna_umi_dnst_spl_cnd_plot_pdf: + mid_fltr_umi_dnst_spl_cnd_plot_pdf: type: File? outputBinding: - glob: "*_mid_fltr_rna_umi_dnst_spl_cnd.pdf" + glob: "*_mid_fltr_umi_dnst_spl_cnd.pdf" doc: | - Split by grouping condition UMI per cell density for RNA assay (intermediate filtered). + Split by grouping condition transcripts per cell density (intermediate filtered). PDF format mid_fltr_gene_dnst_spl_cnd_plot_png: @@ -1226,20 +1227,20 @@ outputs: Split by grouping condition the novelty score per cell density for RNA assay (intermediate filtered). PDF format - mid_fltr_atac_umi_dnst_spl_cnd_plot_png: + mid_fltr_frgm_dnst_spl_cnd_plot_png: type: File? outputBinding: - glob: "*_mid_fltr_atac_umi_dnst_spl_cnd.png" + glob: "*_mid_fltr_frgm_dnst_spl_cnd.png" doc: | - Split by grouping condition UMI per cell density for ATAC assay (intermediate filtered). + Split by grouping condition fragments in peaks per cell density (intermediate filtered). PNG format - mid_fltr_atac_umi_dnst_spl_cnd_plot_pdf: + mid_fltr_frgm_dnst_spl_cnd_plot_pdf: type: File? outputBinding: - glob: "*_mid_fltr_atac_umi_dnst_spl_cnd.pdf" + glob: "*_mid_fltr_frgm_dnst_spl_cnd.pdf" doc: | - Split by grouping condition UMI per cell density for ATAC assay (intermediate filtered). + Split by grouping condition fragments in peaks per cell density (intermediate filtered). PDF format mid_fltr_peak_dnst_spl_cnd_plot_png: @@ -1324,20 +1325,20 @@ outputs: Number of cells per dataset (filtered). PDF format - fltr_rna_umi_dnst_plot_png: + fltr_umi_dnst_plot_png: type: File? outputBinding: - glob: "*[!_mid]_fltr_rna_umi_dnst.png" + glob: "*[!_mid]_fltr_umi_dnst.png" doc: | - UMI per cell density for RNA assay (filtered). + Transcripts per cell density (filtered). PNG format - fltr_rna_umi_dnst_plot_pdf: + fltr_umi_dnst_plot_pdf: type: File? outputBinding: - glob: "*[!_mid]_fltr_rna_umi_dnst.pdf" + glob: "*[!_mid]_fltr_umi_dnst.pdf" doc: | - UMI per cell density for RNA assay (filtered). 
+ Transcripts per cell density (filtered). PDF format fltr_gene_dnst_plot_png: @@ -1356,20 +1357,20 @@ outputs: Genes per cell density (filtered). PDF format - fltr_gene_umi_corr_plot_png: + fltr_gene_umi_plot_png: type: File? outputBinding: - glob: "*[!_mid]_fltr_gene_umi_corr.png" + glob: "*[!_mid]_fltr_gene_umi.png" doc: | - Genes vs UMI per cell correlation for RNA assay (filtered). + Genes vs transcripts per cell (filtered). PNG format - fltr_gene_umi_corr_plot_pdf: + fltr_gene_umi_plot_pdf: type: File? outputBinding: - glob: "*[!_mid]_fltr_gene_umi_corr.pdf" + glob: "*[!_mid]_fltr_gene_umi.pdf" doc: | - Genes vs UMI per cell correlation for RNA assay (filtered). + Genes vs transcripts per cell (filtered). PDF format fltr_mito_dnst_plot_png: @@ -1404,20 +1405,20 @@ outputs: Novelty score per cell density for RNA assay (filtered). PDF format - fltr_atac_umi_dnst_plot_png: + fltr_frgm_dnst_plot_png: type: File? outputBinding: - glob: "*[!_mid]_fltr_atac_umi_dnst.png" + glob: "*[!_mid]_fltr_frgm_dnst.png" doc: | - UMI per cell density for ATAC assay (filtered). + Fragments in peaks per cell density (filtered). PNG format - fltr_atac_umi_dnst_plot_pdf: + fltr_frgm_dnst_plot_pdf: type: File? outputBinding: - glob: "*[!_mid]_fltr_atac_umi_dnst.pdf" + glob: "*[!_mid]_fltr_frgm_dnst.pdf" doc: | - UMI per cell density for ATAC assay (filtered). + Fragments in peaks per cell density (filtered). PDF format fltr_peak_dnst_plot_png: @@ -1452,20 +1453,20 @@ outputs: Fraction of ATAC fragments within genomic blacklist regions per cell density (filtered). PDF format - fltr_rna_atac_umi_corr_plot_png: + fltr_rna_atac_cnts_plot_png: type: File? outputBinding: - glob: "*[!_mid]_fltr_rna_atac_umi_corr.png" + glob: "*[!_mid]_fltr_rna_atac_cnts.png" doc: | - UMI per cell correlation for RNA vs ATAC assays (filtered). + Transcripts vs fragments in peaks per cell (filtered). PNG format - fltr_rna_atac_umi_corr_plot_pdf: + fltr_rna_atac_cnts_plot_pdf: type: File? 
outputBinding: - glob: "*[!_mid]_fltr_rna_atac_umi_corr.pdf" + glob: "*[!_mid]_fltr_rna_atac_cnts.pdf" doc: | - UMI per cell correlation for RNA vs ATAC assays (filtered). + Transcripts vs fragments in peaks per cell (filtered). PDF format fltr_rnadbl_plot_png: @@ -1516,20 +1517,20 @@ outputs: Doublets overlap for RNA and ATAC assays per dataset (filtered). PDF format - fltr_tss_atac_umi_corr_plot_png: + fltr_tss_frgm_plot_png: type: File? outputBinding: - glob: "*[!_mid]_fltr_tss_atac_umi_corr.png" + glob: "*[!_mid]_fltr_tss_frgm.png" doc: | - TSS enrichment score vs UMI per cell correlation for ATAC assay (filtered). + TSS enrichment score vs fragments in peaks per cell (filtered). PNG format - fltr_tss_atac_umi_corr_plot_pdf: + fltr_tss_frgm_plot_pdf: type: File? outputBinding: - glob: "*[!_mid]_fltr_tss_atac_umi_corr.pdf" + glob: "*[!_mid]_fltr_tss_frgm.pdf" doc: | - TSS enrichment score vs UMI per cell correlation for ATAC assay (filtered). + TSS enrichment score vs fragments in peaks per cell (filtered). PDF format fltr_qc_mtrcs_dnst_plot_png: @@ -1580,20 +1581,20 @@ outputs: Fragments length histogram (filtered). PDF format - fltr_rna_umi_dnst_spl_cnd_plot_png: + fltr_umi_dnst_spl_cnd_plot_png: type: File? outputBinding: - glob: "*[!_mid]_fltr_rna_umi_dnst_spl_cnd.png" + glob: "*[!_mid]_fltr_umi_dnst_spl_cnd.png" doc: | - Split by grouping condition UMI per cell density for RNA assay (filtered). + Split by grouping condition transcripts per cell density (filtered). PNG format - fltr_rna_umi_dnst_spl_cnd_plot_pdf: + fltr_umi_dnst_spl_cnd_plot_pdf: type: File? outputBinding: - glob: "*[!_mid]_fltr_rna_umi_dnst_spl_cnd.pdf" + glob: "*[!_mid]_fltr_umi_dnst_spl_cnd.pdf" doc: | - Split by grouping condition UMI per cell density for RNA assay (filtered). + Split by grouping condition transcripts per cell density (filtered). 
PDF format fltr_gene_dnst_spl_cnd_plot_png: @@ -1646,20 +1647,20 @@ outputs: Split by grouping condition the novelty score per cell density for RNA assay (filtered). PDF format - fltr_atac_umi_dnst_spl_cnd_plot_png: + fltr_frgm_dnst_spl_cnd_plot_png: type: File? outputBinding: - glob: "*[!_mid]_fltr_atac_umi_dnst_spl_cnd.png" + glob: "*[!_mid]_fltr_frgm_dnst_spl_cnd.png" doc: | - Split by grouping condition UMI per cell density for ATAC assay (filtered). + Split by grouping condition fragments in peaks per cell density (filtered). PNG format - fltr_atac_umi_dnst_spl_cnd_plot_pdf: + fltr_frgm_dnst_spl_cnd_plot_pdf: type: File? outputBinding: - glob: "*[!_mid]_fltr_atac_umi_dnst_spl_cnd.pdf" + glob: "*[!_mid]_fltr_frgm_dnst_spl_cnd.pdf" doc: | - Split by grouping condition UMI per cell density for ATAC assay (filtered). + Split by grouping condition fragments in peaks per cell density (filtered). PDF format fltr_peak_dnst_spl_cnd_plot_png: @@ -1815,35 +1816,30 @@ doc: | s:about: | - usage: sc_multiome_filter.R [-h] --mex MEX --identity IDENTITY - --fragments FRAGMENTS --annotations - ANNOTATIONS --seqinfo SEQINFO - [--grouping GROUPING] - [--blacklist BLACKLIST] - [--barcodes BARCODES] - [--rnamincells RNAMINCELLS] - [--mingenes [MINGENES [MINGENES ...]]] - [--maxgenes [MAXGENES [MAXGENES ...]]] - [--rnaminumi [RNAMINUMI [RNAMINUMI ...]]] - [--mitopattern MITOPATTERN] - [--maxmt MAXMT] - [--minnovelty [MINNOVELTY [MINNOVELTY ...]]] - [--atacmincells ATACMINCELLS] - [--atacminumi [ATACMINUMI [ATACMINUMI ...]]] - [--maxnuclsignal [MAXNUCLSIGNAL [MAXNUCLSIGNAL ...]]] - [--mintssenrich [MINTSSENRICH [MINTSSENRICH ...]]] - [--minfrip [MINFRIP [MINFRIP ...]]] - [--maxblacklist [MAXBLACKLIST [MAXBLACKLIST ...]]] - [--callby CALLBY] - [--removedoublets {union,onlyrna,onlyatac,intersect}] - [--rnadbr RNADBR] - [--rnadbrsd RNADBRSD] - [--atacdbr ATACDBR] - [--atacdbrsd ATACDBRSD] [--pdf] - [--verbose] [--h5seurat] [--h5ad] - [--cbbuild] [--output OUTPUT] - [--theme 
{gray,bw,linedraw,light,dark,minimal,classic,void}] - [--cpus CPUS] [--memory MEMORY] + usage: sc_multiome_filter.R [-h] --mex MEX --identity IDENTITY --fragments + FRAGMENTS --annotations ANNOTATIONS --seqinfo + SEQINFO [--grouping GROUPING] + [--blacklist BLACKLIST] [--barcodes BARCODES] + [--rnamincells RNAMINCELLS] + [--mingenes [MINGENES [MINGENES ...]]] + [--maxgenes [MAXGENES [MAXGENES ...]]] + [--minumis [MINUMIS [MINUMIS ...]]] + [--mitopattern MITOPATTERN] [--maxmt MAXMT] + [--minnovelty [MINNOVELTY [MINNOVELTY ...]]] + [--atacmincells ATACMINCELLS] + [--minfragments [MINFRAGMENTS [MINFRAGMENTS ...]]] + [--maxnuclsignal [MAXNUCLSIGNAL [MAXNUCLSIGNAL ...]]] + [--mintssenrich [MINTSSENRICH [MINTSSENRICH ...]]] + [--minfrip [MINFRIP [MINFRIP ...]]] + [--maxblacklist [MAXBLACKLIST [MAXBLACKLIST ...]]] + [--callby CALLBY] + [--removedoublets {union,onlyrna,onlyatac,intersect}] + [--rnadbr RNADBR] [--rnadbrsd RNADBRSD] + [--atacdbr ATACDBR] [--atacdbrsd ATACDBRSD] + [--pdf] [--verbose] [--h5seurat] [--h5ad] + [--cbbuild] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] Single-cell Multiome ATAC and RNA-Seq Filtering Analysis @@ -1899,9 +1895,9 @@ s:about: | will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. Default: 5000 (applied to all datasets) - --rnaminumi [RNAMINUMI [RNAMINUMI ...]] - Include cells where at least this many UMI (RNA - transcripts) are detected. If multiple values + --minumis [MINUMIS [MINUMIS ...]] + Include cells where at least this many UMI + (transcripts) are detected. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. Default: 500 (applied to all @@ -1922,13 +1918,12 @@ s:about: | --atacmincells ATACMINCELLS Include only peaks detected in at least this many cells. 
Default: 5 (applied to all datasets) - --atacminumi [ATACMINUMI [ATACMINUMI ...]] - Include cells where at least this many UMI (ATAC - transcripts) are detected. If multiple values - provided, each of them will be applied to the - correspondent dataset from the '--mex' input based on - the '--identity' file. Default: 1000 (applied to all - datasets) + --minfragments [MINFRAGMENTS [MINFRAGMENTS ...]] + Include cells where at least this many fragments in + peaks are detected. If multiple values provided, each + of them will be applied to the correspondent dataset + from the '--mex' input based on the '--identity' file. + Default: 1000 (applied to all datasets) --maxnuclsignal [MAXNUCLSIGNAL [MAXNUCLSIGNAL ...]] Include cells with the nucleosome signal not bigger than this value. Nucleosome signal quantifies the diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index d75ee916..d467f0a4 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.22 + dockerPull: biowardrobe2/sc-tools:v0.0.23 inputs: diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index 3701ef9c..0521b0bc 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.22 + dockerPull: biowardrobe2/sc-tools:v0.0.23 inputs: diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index be87a1c6..2792f20e 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.22 + dockerPull: biowardrobe2/sc-tools:v0.0.23 inputs: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index 8c05b295..5ecb6232 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -17,7 +17,7 @@ requirements: 
hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.22 + dockerPull: biowardrobe2/sc-tools:v0.0.23 inputs: @@ -95,13 +95,13 @@ inputs: the '--mex' input based on the '--identity' file. Default: 5000 (applied to all datasets) - rna_minimum_umi: + minimum_umis: type: - "null" - int - int[] inputBinding: - prefix: "--rnaminumi" + prefix: "--minumis" doc: | Include cells where at least this many UMI (transcripts) are detected. If multiple values provided, each of them will be applied to the correspondent @@ -314,7 +314,7 @@ outputs: outputBinding: glob: "*_raw_umi_dnst.png" doc: | - UMI per cell density (not filtered). + Transcripts per cell density (not filtered). PNG format raw_umi_dnst_plot_pdf: @@ -322,7 +322,7 @@ outputs: outputBinding: glob: "*_raw_umi_dnst.pdf" doc: | - UMI per cell density (not filtered). + Transcripts per cell density (not filtered). PDF format raw_gene_dnst_plot_png: @@ -341,20 +341,20 @@ outputs: Genes per cell density (not filtered). PDF format - raw_gene_umi_corr_plot_png: + raw_gene_umi_plot_png: type: File? outputBinding: - glob: "*_raw_gene_umi_corr.png" + glob: "*_raw_gene_umi.png" doc: | - Genes vs UMI per cell correlation (not filtered). + Genes vs transcripts per cell correlation (not filtered). PNG format - raw_gene_umi_corr_plot_pdf: + raw_gene_umi_plot_pdf: type: File? outputBinding: - glob: "*_raw_gene_umi_corr.pdf" + glob: "*_raw_gene_umi.pdf" doc: | - Genes vs UMI per cell correlation (not filtered). + Genes vs transcripts per cell correlation (not filtered). PDF format raw_mito_dnst_plot_png: @@ -426,7 +426,7 @@ outputs: outputBinding: glob: "*_raw_umi_dnst_spl_cnd.png" doc: | - Split by grouping condition UMI per cell density (not filtered). + Split by grouping condition transcripts per cell density (not filtered). 
PNG format raw_umi_dnst_spl_cnd_plot_pdf: @@ -434,7 +434,7 @@ outputs: outputBinding: glob: "*_raw_umi_dnst_spl_cnd.pdf" doc: | - Split by grouping condition UMI per cell density (not filtered). + Split by grouping condition transcripts per cell density (not filtered). PDF format raw_gene_dnst_spl_cnd_plot_png: @@ -540,7 +540,7 @@ outputs: outputBinding: glob: "*_fltr_umi_dnst.png" doc: | - UMI per cell density (filtered). + Transcripts per cell density (filtered). PNG format fltr_umi_dnst_plot_pdf: @@ -548,7 +548,7 @@ outputs: outputBinding: glob: "*_fltr_umi_dnst.pdf" doc: | - UMI per cell density (filtered). + Transcripts per cell density (filtered). PDF format fltr_gene_dnst_plot_png: @@ -567,20 +567,20 @@ outputs: Genes per cell density (filtered). PDF format - fltr_gene_umi_corr_plot_png: + fltr_gene_umi_plot_png: type: File? outputBinding: - glob: "*_fltr_gene_umi_corr.png" + glob: "*_fltr_gene_umi.png" doc: | - Genes vs UMI per cell correlation (filtered). + Genes vs transcripts per cell correlation (filtered). PNG format - fltr_gene_umi_corr_plot_pdf: + fltr_gene_umi_plot_pdf: type: File? outputBinding: - glob: "*_fltr_gene_umi_corr.pdf" + glob: "*_fltr_gene_umi.pdf" doc: | - Genes vs UMI per cell correlation (filtered). + Genes vs transcripts per cell correlation (filtered). PDF format fltr_mito_dnst_plot_png: @@ -652,7 +652,7 @@ outputs: outputBinding: glob: "*_fltr_umi_dnst_spl_cnd.png" doc: | - Split by grouping condition UMI per cell density (filtered). + Split by grouping condition transcripts per cell density (filtered). PNG format fltr_umi_dnst_spl_cnd_plot_pdf: @@ -660,7 +660,7 @@ outputs: outputBinding: glob: "*_fltr_umi_dnst_spl_cnd.pdf" doc: | - Split by grouping condition UMI per cell density (filtered). + Split by grouping condition transcripts per cell density (filtered). PDF format fltr_gene_dnst_spl_cnd_plot_png: @@ -832,18 +832,19 @@ doc: | s:about: | - usage: sc_rna_filter.R - [-h] --mex MEX [MEX ...] 
--identity IDENTITY [--grouping GROUPING] - [--barcodes BARCODES] [--rnamincells RNAMINCELLS] - [--mingenes [MINGENES [MINGENES ...]]] - [--maxgenes [MAXGENES [MAXGENES ...]]] - [--rnaminumi [RNAMINUMI [RNAMINUMI ...]]] - [--minnovelty [MINNOVELTY [MINNOVELTY ...]]] - [--mitopattern MITOPATTERN] [--maxmt MAXMT] [--removedoublets] - [--rnadbr RNADBR] [--rnadbrsd RNADBRSD] [--pdf] [--verbose] - [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] - [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] - [--cpus CPUS] [--memory MEMORY] + usage: sc_rna_filter.R [-h] --mex MEX [MEX ...] --identity IDENTITY + [--grouping GROUPING] [--barcodes BARCODES] + [--rnamincells RNAMINCELLS] + [--mingenes [MINGENES [MINGENES ...]]] + [--maxgenes [MAXGENES [MAXGENES ...]]] + [--minumis [MINUMIS [MINUMIS ...]]] + [--minnovelty [MINNOVELTY [MINNOVELTY ...]]] + [--mitopattern MITOPATTERN] [--maxmt MAXMT] + [--removedoublets] [--rnadbr RNADBR] + [--rnadbrsd RNADBRSD] [--pdf] [--verbose] [--h5seurat] + [--h5ad] [--cbbuild] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] Single-cell RNA-Seq Filtering Analysis @@ -894,7 +895,7 @@ s:about: | will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. Default: 5000 (applied to all datasets) - --rnaminumi [RNAMINUMI [RNAMINUMI ...]] + --minumis [MINUMIS [MINUMIS ...]] Include cells where at least this many UMI (transcripts) are detected. 
If multiple values provided, each of them will be applied to the diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index 00c3d1d3..4afeda9c 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.22 + dockerPull: biowardrobe2/sc-tools:v0.0.23 inputs: @@ -503,7 +503,7 @@ outputs: outputBinding: glob: "*_umap_spl_umi.png" doc: | - Split by the UMI per cell counts cells UMAP. + Split by the transcripts per cell counts cells UMAP. PNG format umap_spl_umi_plot_pdf: @@ -511,7 +511,7 @@ outputs: outputBinding: glob: "*_umap_spl_umi.pdf" doc: | - Split by the UMI per cell counts cells UMAP. + Split by the transcripts per cell counts cells UMAP. PDF format umap_spl_gene_plot_png: @@ -631,7 +631,7 @@ outputs: outputBinding: glob: "*_umap_gr_cnd_spl_umi.png" doc: | - Grouped by condition split by the UMI per cell counts cells UMAP. + Grouped by condition split by the transcripts per cell counts cells UMAP. PNG format umap_gr_cnd_spl_umi_plot_pdf: @@ -639,7 +639,7 @@ outputs: outputBinding: glob: "*_umap_gr_cnd_spl_umi.pdf" doc: | - Grouped by condition split by the UMI per cell counts cells UMAP. + Grouped by condition split by the transcripts per cell counts cells UMAP. 
PDF format umap_gr_cnd_spl_gene_plot_png: diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index 304828a2..1c83645b 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.22 + dockerPull: biowardrobe2/sc-tools:v0.0.23 inputs: diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index 01617d62..d9a0d4a8 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.22 + dockerPull: biowardrobe2/sc-tools:v0.0.23 inputs: diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index 625f2f5e..3531e177 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -332,17 +332,17 @@ outputs: tab: 'Per dataset' Caption: 'Split by dataset cells UMAP' - umap_spl_umi_plot_png: + umap_spl_frgm_plot_png: type: File? - outputSource: sc_atac_reduce/umap_spl_umi_plot_png - label: "Split by the UMI per cell counts cells UMAP" + outputSource: sc_atac_reduce/umap_spl_frgm_plot_png + label: "Split by the fragments in peaks per cell counts cells UMAP" doc: | - Split by the UMI per cell counts cells UMAP. + Split by the fragments in peaks per cell counts cells UMAP. PNG format 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by the UMI per cell counts cells UMAP' + Caption: 'Split by the fragments in peaks per cell counts cells UMAP' umap_spl_peak_plot_png: type: File? 
@@ -481,7 +481,7 @@ steps: - umap_plot_png - umap_spl_idnt_plot_png - umap_spl_cnd_plot_png - - umap_spl_umi_plot_png + - umap_spl_frgm_plot_png - umap_spl_peak_plot_png - umap_spl_tss_plot_png - umap_spl_ncls_plot_png diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 5b9a4956..1861266b 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -125,15 +125,15 @@ inputs: "sd:layout": advanced: true - rna_minimum_umi: + minimum_umis: type: string? default: "500" - label: "Minimum number of RNA UMI counts per cell" + label: "Minimum number of transcripts per cell" doc: | Quality control filtering threshold to exclude from the analysis all - cells with the number of RNA UMI - counts smaller than the provided value. + cells with the number of transcripts + smaller than the provided value. If the selected "Cell Ranger ARC Sample" includes multiple aggregated datasets, each of them can be filtered @@ -250,17 +250,17 @@ inputs: "sd:layout": advanced: true - atac_minimum_umi: + minimum_fragments: type: string? default: "1000" - label: "Minimum number of ATAC UMI counts per cell" + label: "Minimum number of fragments in peaks per cell" doc: | Quality control filtering threshold to exclude from the analysis all - cells with the number of ATAC UMI - counts smaller than the provided value. - If the selected "Cell Ranger ARC - Sample" includes multiple aggregated + cells with the number of fragments + in peaks smaller than the provided + value. If the selected "Cell Ranger + ARC Sample" includes multiple aggregated datasets, each of them can be filtered independently by providing comma or space-separated list of filtering @@ -460,17 +460,17 @@ outputs: tab: "Raw" Caption: "Cells per dataset" - raw_rna_umi_dnst_plot_png: + raw_umi_dnst_plot_png: type: File? 
- outputSource: sc_multiome_filter/raw_rna_umi_dnst_plot_png - label: "RNA UMI per cell, raw" + outputSource: sc_multiome_filter/raw_umi_dnst_plot_png + label: "Transcripts per cell, raw" doc: | - RNA UMI per cell density + Transcripts per cell density for raw data "sd:visualPlugins": - image: tab: "Raw" - Caption: "RNA UMI per cell" + Caption: "Transcripts per cell" raw_gene_dnst_plot_png: type: File? @@ -484,17 +484,17 @@ outputs: tab: "Raw" Caption: "Genes per cell" - raw_gene_umi_corr_plot_png: + raw_gene_umi_plot_png: type: File? - outputSource: sc_multiome_filter/raw_gene_umi_corr_plot_png - label: "Genes vs RNA UMI, raw" + outputSource: sc_multiome_filter/raw_gene_umi_plot_png + label: "Genes vs transcripts, raw" doc: | - Genes vs RNA UMI per cell + Genes vs transcripts per cell for raw data "sd:visualPlugins": - image: tab: "Raw" - Caption: "Genes vs RNA UMI" + Caption: "Genes vs transcripts" raw_mito_dnst_plot_png: type: File? @@ -521,17 +521,17 @@ outputs: tab: "Raw" Caption: "Novelty score" - raw_atac_umi_dnst_plot_png: + raw_frgm_dnst_plot_png: type: File? - outputSource: sc_multiome_filter/raw_atac_umi_dnst_plot_png - label: "ATAC UMI per cell, raw" + outputSource: sc_multiome_filter/raw_frgm_dnst_plot_png + label: "Fragments in peaks per cell, raw" doc: | - ATAC UMI per cell density + Fragments in peaks per cell density for raw data "sd:visualPlugins": - image: tab: "Raw" - Caption: "ATAC UMI per cell" + Caption: "Fragments in peaks per cell" raw_peak_dnst_plot_png: type: File? @@ -558,29 +558,29 @@ outputs: tab: "Raw" Caption: "Blacklist regions fraction" - raw_rna_atac_umi_corr_plot_png: + raw_rna_atac_cnts_plot_png: type: File? 
- outputSource: sc_multiome_filter/raw_rna_atac_umi_corr_plot_png - label: "RNA UMI vs ATAC UMI, raw" + outputSource: sc_multiome_filter/raw_rna_atac_cnts_plot_png + label: "Transcripts vs fragments in peaks, raw" doc: | - RNA UMI per cell vs ATAC UMI + Transcripts vs fragments in peaks per cell for raw data "sd:visualPlugins": - image: tab: "Raw" - Caption: "RNA UMI vs ATAC UMI" + Caption: "Transcripts vs fragments in peaks" - raw_tss_atac_umi_corr_plot_png: + raw_tss_frgm_plot_png: type: File? - outputSource: sc_multiome_filter/raw_tss_atac_umi_corr_plot_png - label: "TSS enrichment vs ATAC UMI, raw" + outputSource: sc_multiome_filter/raw_tss_frgm_plot_png + label: "TSS enrichment score vs fragments in peaks, raw" doc: | - TSS enrichment score vs ATAC UMI + TSS enrichment score vs fragments in peaks per cell for raw data "sd:visualPlugins": - image: tab: "Raw" - Caption: "TSS enrichment vs ATAC UMI" + Caption: "TSS enrichment score vs fragments in peaks" raw_qc_mtrcs_dnst_plot_png: type: File? @@ -654,17 +654,17 @@ outputs: tab: "Raw" Caption: "Fragments length" - raw_rna_umi_dnst_spl_cnd_plot_png: + raw_umi_dnst_spl_cnd_plot_png: type: File? - outputSource: sc_multiome_filter/raw_rna_umi_dnst_spl_cnd_plot_png - label: "RNA UMI per cell, raw, split by condition" + outputSource: sc_multiome_filter/raw_umi_dnst_spl_cnd_plot_png + label: "Transcripts per cell, raw, split by condition" doc: | - Split by grouping condition RNA UMI - per cell for raw data + Split by grouping condition transcripts + per cell density for raw data "sd:visualPlugins": - image: tab: "Raw, by condition" - Caption: "RNA UMI per cell" + Caption: "Transcripts per cell" raw_gene_dnst_spl_cnd_plot_png: type: File? @@ -705,17 +705,17 @@ outputs: tab: "Raw, by condition" Caption: "Novelty score" - raw_atac_umi_dnst_spl_cnd_plot_png: + raw_frgm_dnst_spl_cnd_plot_png: type: File? 
- outputSource: sc_multiome_filter/raw_atac_umi_dnst_spl_cnd_plot_png - label: "ATAC UMI per cell, raw, split by condition" + outputSource: sc_multiome_filter/raw_frgm_dnst_spl_cnd_plot_png + label: "Fragments in peaks per cell, raw, split by condition" doc: | - Split by grouping condition ATAC - UMI per cell density for raw data + Split by grouping condition fragments + in peaks per cell density for raw data "sd:visualPlugins": - image: tab: "Raw, by condition" - Caption: "ATAC UMI per cell" + Caption: "Fragments in peaks per cell" raw_peak_dnst_spl_cnd_plot_png: type: File? @@ -779,17 +779,17 @@ outputs: tab: "Filtered" Caption: "Cells per dataset" - fltr_rna_umi_dnst_plot_png: + fltr_umi_dnst_plot_png: type: File? - outputSource: sc_multiome_filter/fltr_rna_umi_dnst_plot_png - label: "RNA UMI per cell, filtered" + outputSource: sc_multiome_filter/fltr_umi_dnst_plot_png + label: "Transcripts per cell, filtered" doc: | - RNA UMI per cell density + Transcripts per cell density for filtered data "sd:visualPlugins": - image: tab: "Filtered" - Caption: "RNA UMI per cell" + Caption: "Transcripts per cell" fltr_gene_dnst_plot_png: type: File? @@ -803,17 +803,17 @@ outputs: tab: "Filtered" Caption: "Genes per cell" - fltr_gene_umi_corr_plot_png: + fltr_gene_umi_plot_png: type: File? - outputSource: sc_multiome_filter/fltr_gene_umi_corr_plot_png - label: "Genes vs RNA UMI, filtered" + outputSource: sc_multiome_filter/fltr_gene_umi_plot_png + label: "Genes vs transcripts, filtered" doc: | - Genes vs RNA UMI per cell + Genes vs transcripts per cell for filtered data "sd:visualPlugins": - image: tab: "Filtered" - Caption: "Genes vs RNA UMI" + Caption: "Genes vs transcripts" fltr_mito_dnst_plot_png: type: File? @@ -840,17 +840,17 @@ outputs: tab: "Filtered" Caption: "Novelty score" - fltr_atac_umi_dnst_plot_png: + fltr_frgm_dnst_plot_png: type: File? 
- outputSource: sc_multiome_filter/fltr_atac_umi_dnst_plot_png - label: "ATAC UMI per cell, filtered" + outputSource: sc_multiome_filter/fltr_frgm_dnst_plot_png + label: "Fragments in peaks per cell, filtered" doc: | - ATAC UMI per cell density + Fragments in peaks per cell density for filtered data "sd:visualPlugins": - image: tab: "Filtered" - Caption: "ATAC UMI per cell" + Caption: "Fragments in peaks per cell" fltr_peak_dnst_plot_png: type: File? @@ -877,29 +877,29 @@ outputs: tab: "Filtered" Caption: "Blacklist regions fraction" - fltr_rna_atac_umi_corr_plot_png: + fltr_rna_atac_cnts_plot_png: type: File? - outputSource: sc_multiome_filter/fltr_rna_atac_umi_corr_plot_png - label: "RNA UMI vs ATAC UMI, filtered" + outputSource: sc_multiome_filter/fltr_rna_atac_cnts_plot_png + label: "Transcripts vs fragments in peaks, filtered" doc: | - RNA UMI per cell vs ATAC UMI + Transcripts vs fragments in peaks per cell for filtered data "sd:visualPlugins": - image: tab: "Filtered" - Caption: "RNA UMI vs ATAC UMI" + Caption: "Transcripts vs fragments in peaks" - fltr_tss_atac_umi_corr_plot_png: + fltr_tss_frgm_plot_png: type: File? - outputSource: sc_multiome_filter/fltr_tss_atac_umi_corr_plot_png - label: "TSS enrichment vs ATAC UMI, filtered" + outputSource: sc_multiome_filter/fltr_tss_frgm_plot_png + label: "TSS enrichment score vs fragments in peaks, filtered" doc: | - TSS enrichment score vs ATAC UMI - per cell for filtered data + TSS enrichment score vs fragments in + peaks per cell for filtered data "sd:visualPlugins": - image: tab: "Filtered" - Caption: "TSS enrichment vs ATAC UMI" + Caption: "TSS enrichment score vs fragments in peaks" fltr_qc_mtrcs_dnst_plot_png: type: File? @@ -973,17 +973,17 @@ outputs: tab: "Filtered" Caption: "Fragments length" - fltr_rna_umi_dnst_spl_cnd_plot_png: + fltr_umi_dnst_spl_cnd_plot_png: type: File? 
- outputSource: sc_multiome_filter/fltr_rna_umi_dnst_spl_cnd_plot_png - label: "RNA UMI per cell, filtered, split by condition" + outputSource: sc_multiome_filter/fltr_umi_dnst_spl_cnd_plot_png + label: "Transcripts per cell, filtered, split by condition" doc: | - Split by grouping condition RNA UMI - per cell for filtered data + Split by grouping condition transcripts + per cell density for filtered data "sd:visualPlugins": - image: tab: "Filtered, by condition" - Caption: "RNA UMI per cell" + Caption: "Transcripts per cell" fltr_gene_dnst_spl_cnd_plot_png: type: File? @@ -1024,18 +1024,18 @@ outputs: tab: "Filtered, by condition" Caption: "Novelty score" - fltr_atac_umi_dnst_spl_cnd_plot_png: + fltr_frgm_dnst_spl_cnd_plot_png: type: File? - outputSource: sc_multiome_filter/fltr_atac_umi_dnst_spl_cnd_plot_png - label: "ATAC UMI per cell, filtered, split by condition" + outputSource: sc_multiome_filter/fltr_frgm_dnst_spl_cnd_plot_png + label: "Fragments in peaks per cell, filtered, split by condition" doc: | - Split by grouping condition ATAC - UMI per cell density for filtered + Split by grouping condition fragments + in peaks per cell density for filtered data "sd:visualPlugins": - image: tab: "Filtered, by condition" - Caption: "ATAC UMI per cell" + Caption: "Fragments in peaks per cell" fltr_peak_dnst_spl_cnd_plot_png: type: File? 
@@ -1137,8 +1137,8 @@ steps: maximum_genes: source: maximum_genes valueFrom: $(split_numbers(self)) - rna_minimum_umi: - source: rna_minimum_umi + minimum_umis: + source: minimum_umis valueFrom: $(split_numbers(self)) mito_pattern: mito_pattern maximum_mito_perc: maximum_mito_perc @@ -1147,8 +1147,8 @@ steps: valueFrom: $(split_numbers(self)) atac_minimum_cells: default: 1 # will remove peaks that are not present in any of the cells - atac_minimum_umi: - source: atac_minimum_umi + minimum_fragments: + source: minimum_fragments valueFrom: $(split_numbers(self)) maximum_nucl_signal: source: maximum_nucl_signal @@ -1194,53 +1194,53 @@ steps: - raw_1_2_qc_mtrcs_pca_plot_png - raw_2_3_qc_mtrcs_pca_plot_png - raw_cells_count_plot_png - - raw_rna_umi_dnst_plot_png + - raw_umi_dnst_plot_png - raw_gene_dnst_plot_png - - raw_gene_umi_corr_plot_png + - raw_gene_umi_plot_png - raw_mito_dnst_plot_png - raw_nvlt_dnst_plot_png - - raw_atac_umi_dnst_plot_png + - raw_frgm_dnst_plot_png - raw_peak_dnst_plot_png - raw_blck_dnst_plot_png - - raw_rna_atac_umi_corr_plot_png - - raw_tss_atac_umi_corr_plot_png + - raw_rna_atac_cnts_plot_png + - raw_tss_frgm_plot_png - raw_qc_mtrcs_dnst_plot_png - raw_rnadbl_plot_png - raw_atacdbl_plot_png - raw_vrlpdbl_plot_png - raw_tss_nrch_plot_png - raw_frgm_hist_png - - raw_rna_umi_dnst_spl_cnd_plot_png + - raw_umi_dnst_spl_cnd_plot_png - raw_gene_dnst_spl_cnd_plot_png - raw_mito_dnst_spl_cnd_plot_png - raw_nvlt_dnst_spl_cnd_plot_png - - raw_atac_umi_dnst_spl_cnd_plot_png + - raw_frgm_dnst_spl_cnd_plot_png - raw_peak_dnst_spl_cnd_plot_png - raw_blck_dnst_spl_cnd_plot_png - fltr_1_2_qc_mtrcs_pca_plot_png - fltr_2_3_qc_mtrcs_pca_plot_png - fltr_cells_count_plot_png - - fltr_rna_umi_dnst_plot_png + - fltr_umi_dnst_plot_png - fltr_gene_dnst_plot_png - - fltr_gene_umi_corr_plot_png + - fltr_gene_umi_plot_png - fltr_mito_dnst_plot_png - fltr_nvlt_dnst_plot_png - - fltr_atac_umi_dnst_plot_png + - fltr_frgm_dnst_plot_png - fltr_peak_dnst_plot_png - 
fltr_blck_dnst_plot_png - - fltr_rna_atac_umi_corr_plot_png + - fltr_rna_atac_cnts_plot_png - fltr_rnadbl_plot_png - fltr_atacdbl_plot_png - fltr_vrlpdbl_plot_png - - fltr_tss_atac_umi_corr_plot_png + - fltr_tss_frgm_plot_png - fltr_qc_mtrcs_dnst_plot_png - fltr_tss_nrch_plot_png - fltr_frgm_hist_png - - fltr_rna_umi_dnst_spl_cnd_plot_png + - fltr_umi_dnst_spl_cnd_plot_png - fltr_gene_dnst_spl_cnd_plot_png - fltr_mito_dnst_spl_cnd_plot_png - fltr_nvlt_dnst_spl_cnd_plot_png - - fltr_atac_umi_dnst_spl_cnd_plot_png + - fltr_frgm_dnst_spl_cnd_plot_png - fltr_peak_dnst_spl_cnd_plot_png - fltr_blck_dnst_spl_cnd_plot_png - ucsc_cb_html_data diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index 682b5c02..77086701 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -95,7 +95,7 @@ inputs: 'sd:layout': advanced: true - rna_minimum_umi: + minimum_umis: type: string? default: "500" label: "Include cells where at least this many UMI (transcripts) are detected" @@ -282,14 +282,14 @@ outputs: raw_umi_dnst_plot_png: type: File? outputSource: sc_rna_filter/raw_umi_dnst_plot_png - label: "UMI per cell density (not filtered)" + label: "Transcripts per cell density (not filtered)" doc: | - UMI per cell density (not filtered). + Transcripts per cell density (not filtered). PNG format 'sd:visualPlugins': - image: tab: 'Not filtered QC' - Caption: 'UMI per cell density' + Caption: 'Transcripts per cell density' raw_gene_dnst_plot_png: type: File? @@ -303,17 +303,17 @@ outputs: tab: 'Not filtered QC' Caption: 'Genes per cell density' - raw_gene_umi_corr_plot_png: + raw_gene_umi_plot_png: type: File? - outputSource: sc_rna_filter/raw_gene_umi_corr_plot_png - label: "Genes vs UMI per cell correlation (not filtered)" + outputSource: sc_rna_filter/raw_gene_umi_plot_png + label: "Genes vs transcripts per cell correlation (not filtered)" doc: | - Genes vs UMI per cell correlation (not filtered). 
+ Genes vs transcripts per cell correlation (not filtered). PNG format 'sd:visualPlugins': - image: tab: 'Not filtered QC' - Caption: 'Genes vs UMI per cell correlation' + Caption: 'Genes vs transcripts per cell correlation' raw_mito_dnst_plot_png: type: File? @@ -366,14 +366,14 @@ outputs: raw_umi_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/raw_umi_dnst_spl_cnd_plot_png - label: "Split by grouping condition UMI per cell density (not filtered)" + label: "Split by grouping condition transcripts per cell density (not filtered)" doc: | - Split by grouping condition UMI per cell density (not filtered). + Split by grouping condition transcripts per cell density (not filtered). PNG format 'sd:visualPlugins': - image: tab: 'Not filtered QC' - Caption: 'Split by grouping condition UMI per cell density' + Caption: 'Split by grouping condition transcripts per cell density' raw_gene_dnst_spl_cnd_plot_png: type: File? @@ -451,14 +451,14 @@ outputs: fltr_umi_dnst_plot_png: type: File? outputSource: sc_rna_filter/fltr_umi_dnst_plot_png - label: "UMI per cell density (filtered)" + label: "Transcripts per cell density (filtered)" doc: | - UMI per cell density (filtered). + Transcripts per cell density (filtered). PNG format 'sd:visualPlugins': - image: tab: 'Filtered QC' - Caption: 'UMI per cell density' + Caption: 'Transcripts per cell density' fltr_gene_dnst_plot_png: type: File? @@ -472,17 +472,17 @@ outputs: tab: 'Filtered QC' Caption: 'Genes per cell density' - fltr_gene_umi_corr_plot_png: + fltr_gene_umi_plot_png: type: File? - outputSource: sc_rna_filter/fltr_gene_umi_corr_plot_png - label: "Genes vs UMI per cell correlation (filtered)" + outputSource: sc_rna_filter/fltr_gene_umi_plot_png + label: "Genes vs transcripts per cell correlation (filtered)" doc: | - Genes vs UMI per cell correlation (filtered). + Genes vs transcripts per cell correlation (filtered). 
PNG format 'sd:visualPlugins': - image: tab: 'Filtered QC' - Caption: 'Genes vs UMI per cell correlation' + Caption: 'Genes vs transcripts per cell correlation' fltr_mito_dnst_plot_png: type: File? @@ -535,14 +535,14 @@ outputs: fltr_umi_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/fltr_umi_dnst_spl_cnd_plot_png - label: "Split by grouping condition UMI per cell density (filtered)" + label: "Split by grouping condition transcripts per cell density (filtered)" doc: | - Split by grouping condition UMI per cell density (filtered). + Split by grouping condition transcripts per cell density (filtered). PNG format 'sd:visualPlugins': - image: tab: 'Filtered QC' - Caption: 'Split by grouping condition UMI per cell density' + Caption: 'Split by grouping condition transcripts per cell density' fltr_gene_dnst_spl_cnd_plot_png: type: File? @@ -649,8 +649,8 @@ steps: maximum_genes: source: maximum_genes valueFrom: $(split_numbers(self)) - rna_minimum_umi: - source: rna_minimum_umi + minimum_umis: + source: minimum_umis valueFrom: $(split_numbers(self)) minimum_novelty_score: source: minimum_novelty_score @@ -684,7 +684,7 @@ steps: - raw_cells_count_plot_png - raw_umi_dnst_plot_png - raw_gene_dnst_plot_png - - raw_gene_umi_corr_plot_png + - raw_gene_umi_plot_png - raw_mito_dnst_plot_png - raw_nvlt_dnst_plot_png - raw_qc_mtrcs_dnst_plot_png @@ -698,7 +698,7 @@ steps: - fltr_cells_count_plot_png - fltr_umi_dnst_plot_png - fltr_gene_dnst_plot_png - - fltr_gene_umi_corr_plot_png + - fltr_gene_umi_plot_png - fltr_mito_dnst_plot_png - fltr_nvlt_dnst_plot_png - fltr_qc_mtrcs_dnst_plot_png diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 35330d6d..629abfd3 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -415,14 +415,14 @@ outputs: umap_spl_umi_plot_png: type: File? 
outputSource: sc_rna_reduce/umap_spl_umi_plot_png - label: "Split by the UMI per cell counts cells UMAP" + label: "Split by the transcripts per cell counts cells UMAP" doc: | - Split by the UMI per cell counts cells UMAP. + Split by the transcripts per cell counts cells UMAP. PNG format 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by the UMI per cell counts cells UMAP' + Caption: 'Split by the transcripts per cell counts cells UMAP' umap_spl_gene_plot_png: type: File? @@ -511,14 +511,14 @@ outputs: umap_gr_cnd_spl_umi_plot_png: type: File? outputSource: sc_rna_reduce/umap_gr_cnd_spl_umi_plot_png - label: "Grouped by condition split by the UMI per cell counts cells UMAP" + label: "Grouped by condition split by the transcripts per cell counts cells UMAP" doc: | - Grouped by condition split by the UMI per cell counts cells UMAP. + Grouped by condition split by the transcripts per cell counts cells UMAP. PNG format 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Grouped by condition split by the UMI per cell counts cells UMAP' + Caption: 'Grouped by condition split by the transcripts per cell counts cells UMAP' umap_gr_cnd_spl_gene_plot_png: type: File? From 7d8d09da4207f28724c0b735782daaedc0431533 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 30 Jun 2023 17:21:59 -0400 Subject: [PATCH 042/162] Optionally export cellbrowser data from dim. reduc. 
workflows --- workflows/sc-atac-cluster.cwl | 1 - workflows/sc-atac-reduce.cwl | 34 ++++++++++++++++++++++++++++++++-- workflows/sc-ctype-assign.cwl | 1 - workflows/sc-rna-cluster.cwl | 1 - workflows/sc-rna-da-cells.cwl | 1 - workflows/sc-rna-filter.cwl | 1 - workflows/sc-rna-reduce.cwl | 34 ++++++++++++++++++++++++++++++++-- workflows/sc-triangulate.cwl | 1 - workflows/sc-wnn-cluster.cwl | 1 - 9 files changed, 64 insertions(+), 11 deletions(-) diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index bcdb4c24..1983f8e2 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -445,7 +445,6 @@ steps: - cmp_gr_cnd_spl_clst_res_plot_png - cvrg_res_plot_png - peak_markers_tsv - - ucsc_cb_config_data - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index 3531e177..27da9073 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -215,6 +215,16 @@ inputs: 'sd:layout': advanced: true + export_ucsc_cb: + type: boolean? + default: false + label: "Show results in UCSC Cell Browser" + doc: | + Export results into UCSC Cell Browser + Default: false + 'sd:layout': + advanced: true + color_theme: type: - "null" @@ -416,6 +426,25 @@ outputs: tab: 'Per group' Caption: 'Split by grouping condition cells UMAP' + ucsc_cb_html_data: + type: Directory? + outputSource: sc_atac_reduce/ucsc_cb_html_data + label: "UCSC Cell Browser data" + doc: | + Directory with UCSC Cell Browser + data + + ucsc_cb_html_file: + type: File? 
+ outputSource: sc_atac_reduce/ucsc_cb_html_file + label: "UCSC Cell Browser" + doc: | + UCSC Cell Browser HTML index file + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + seurat_data_rds: type: File outputSource: sc_atac_reduce/seurat_data_rds @@ -463,8 +492,7 @@ steps: umap_method: umap_method verbose: default: true - export_ucsc_cb: - default: false + export_ucsc_cb: export_ucsc_cb color_theme: color_theme parallel_memory_limit: source: parallel_memory_limit @@ -487,6 +515,8 @@ steps: - umap_spl_ncls_plot_png - umap_spl_frip_plot_png - umap_spl_blck_plot_png + - ucsc_cb_html_data + - ucsc_cb_html_file - seurat_data_rds - stdout_log - stderr_log diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index b1b60224..b1dfa427 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -750,7 +750,6 @@ steps: - xpr_htmp_plot_png - gene_markers_tsv - peak_markers_tsv - - ucsc_cb_config_data - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index 8f740c7e..a00be673 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -550,7 +550,6 @@ steps: - xpr_dnst_res_plot_png - xpr_htmp_res_plot_png - gene_markers_tsv - - ucsc_cb_config_data - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds diff --git a/workflows/sc-rna-da-cells.cwl b/workflows/sc-rna-da-cells.cwl index 9befb120..b4d3c910 100644 --- a/workflows/sc-rna-da-cells.cwl +++ b/workflows/sc-rna-da-cells.cwl @@ -399,7 +399,6 @@ steps: - umap_spl_idnt_rd_rnaumap_da_scr_plot_png - umap_spl_idnt_rd_atacumap_da_scr_plot_png - umap_spl_idnt_rd_wnnumap_da_scr_plot_png - - ucsc_cb_config_data - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index 77086701..05bdc417 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -707,7 +707,6 @@ steps: - 
fltr_gene_dnst_spl_cnd_plot_png - fltr_mito_dnst_spl_cnd_plot_png - fltr_nvlt_dnst_spl_cnd_plot_png - - ucsc_cb_config_data - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 629abfd3..10781627 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -260,6 +260,16 @@ inputs: 'sd:layout': advanced: true + export_ucsc_cb: + type: boolean? + default: false + label: "Show results in UCSC Cell Browser" + doc: | + Export results into UCSC Cell Browser + Default: false + 'sd:layout': + advanced: true + color_theme: type: - "null" @@ -532,6 +542,25 @@ outputs: tab: 'Per group' Caption: 'Grouped by condition split by the genes per cell counts cells UMAP' + ucsc_cb_html_data: + type: Directory? + outputSource: sc_rna_reduce/ucsc_cb_html_data + label: "UCSC Cell Browser data" + doc: | + Directory with UCSC Cell Browser + data + + ucsc_cb_html_file: + type: File? + outputSource: sc_rna_reduce/ucsc_cb_html_file + label: "UCSC Cell Browser" + doc: | + UCSC Cell Browser HTML index file + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + seurat_data_rds: type: File outputSource: sc_rna_reduce/seurat_data_rds @@ -590,8 +619,7 @@ steps: umap_method: umap_method verbose: default: true - export_ucsc_cb: - default: false + export_ucsc_cb: export_ucsc_cb low_memory: default: true color_theme: color_theme @@ -622,6 +650,8 @@ steps: - umap_gr_cnd_spl_mito_plot_png - umap_gr_cnd_spl_umi_plot_png - umap_gr_cnd_spl_gene_plot_png + - ucsc_cb_html_data + - ucsc_cb_html_file - seurat_data_rds - stdout_log - stderr_log diff --git a/workflows/sc-triangulate.cwl b/workflows/sc-triangulate.cwl index 38446f97..33166383 100644 --- a/workflows/sc-triangulate.cwl +++ b/workflows/sc-triangulate.cwl @@ -330,7 +330,6 @@ steps: - umap_tric_rd_rnaumap_plot_png - umap_tric_rd_atacumap_plot_png - umap_tric_rd_wnnumap_plot_png - - ucsc_cb_config_data - ucsc_cb_html_data - 
ucsc_cb_html_file - seurat_data_rds diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index 1612bde6..8b70c209 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -707,7 +707,6 @@ steps: - xpr_htmp_res_plot_png - gene_markers_tsv - peak_markers_tsv - - ucsc_cb_config_data - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds From 41fc6226f612023631a1563d17c1db45f775ed79 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 7 Jul 2023 15:40:03 -0400 Subject: [PATCH 043/162] Refactor sc rna reduce workflow, update docker image to the latest --- tools/sc-atac-cluster.cwl | 2 +- tools/sc-atac-coverage.cwl | 2 +- tools/sc-atac-dbinding.cwl | 2 +- tools/sc-atac-reduce.cwl | 2 +- tools/sc-ctype-assign.cwl | 2 +- tools/sc-multiome-filter.cwl | 10 +- tools/sc-rna-cluster.cwl | 2 +- tools/sc-rna-da-cells.cwl | 2 +- tools/sc-rna-de-pseudobulk.cwl | 2 +- tools/sc-rna-filter.cwl | 10 +- tools/sc-rna-reduce.cwl | 92 +++-- tools/sc-triangulate.cwl | 2 +- tools/sc-wnn-cluster.cwl | 2 +- workflows/sc-multiome-filter.cwl | 9 + workflows/sc-rna-filter.cwl | 9 + workflows/sc-rna-reduce.cwl | 603 +++++++++++++++---------------- 16 files changed, 387 insertions(+), 366 deletions(-) diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index 292b2965..edc3df1c 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.23 + dockerPull: biowardrobe2/sc-tools:v0.0.24 inputs: diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index e4e4ea8a..9a1509c4 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.23 + dockerPull: biowardrobe2/sc-tools:v0.0.24 inputs: diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index b1941b2d..2a71381a 100644 --- 
a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.23 + dockerPull: biowardrobe2/sc-tools:v0.0.24 inputs: diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index 663d445e..d60921b9 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.23 + dockerPull: biowardrobe2/sc-tools:v0.0.24 inputs: diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index 717cf4fc..ca3f89a4 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.23 + dockerPull: biowardrobe2/sc-tools:v0.0.24 inputs: diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index e83b8945..b6449db5 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.23 + dockerPull: biowardrobe2/sc-tools:v0.0.24 inputs: @@ -1725,6 +1725,14 @@ outputs: doc: | Filtered Seurat data in RDS format + datasets_metadata: + type: File + outputBinding: + glob: "*_meta.tsv" + doc: | + Example of datasets metadata file + in TSV format + seurat_data_h5seurat: type: File? 
outputBinding: diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index d467f0a4..43604ee4 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.23 + dockerPull: biowardrobe2/sc-tools:v0.0.24 inputs: diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index 0521b0bc..1a91eaa9 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.23 + dockerPull: biowardrobe2/sc-tools:v0.0.24 inputs: diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index 2792f20e..8db7b4a7 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.23 + dockerPull: biowardrobe2/sc-tools:v0.0.24 inputs: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index 5ecb6232..587514f2 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.23 + dockerPull: biowardrobe2/sc-tools:v0.0.24 inputs: @@ -741,6 +741,14 @@ outputs: doc: | Filtered Seurat data in RDS format + datasets_metadata: + type: File + outputBinding: + glob: "*_meta.tsv" + doc: | + Example of datasets metadata file + in TSV format + seurat_data_h5seurat: type: File? outputBinding: diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index 4afeda9c..bff3ac7c 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.23 + dockerPull: biowardrobe2/sc-tools:v0.0.24 inputs: @@ -391,7 +391,7 @@ outputs: outputBinding: glob: "*_elbow.png" doc: | - Elbow plot (from cells PCA). 
+ Elbow plot. PNG format elbow_plot_pdf: @@ -399,7 +399,7 @@ outputs: outputBinding: glob: "*_elbow.pdf" doc: | - Elbow plot (from cells PCA). + Elbow plot. PDF format qc_dim_corr_plot_png: @@ -407,7 +407,7 @@ outputs: outputBinding: glob: "*_qc_dim_corr.png" doc: | - Correlation plots between QC metrics and cells PCA components. + Correlation between QC metrics and principal components. PNG format qc_dim_corr_plot_pdf: @@ -415,7 +415,7 @@ outputs: outputBinding: glob: "*_qc_dim_corr.pdf" doc: | - Correlation plots between QC metrics and cells PCA components. + Correlation between QC metrics and principal components. PDF format umap_qc_mtrcs_plot_png: @@ -423,7 +423,7 @@ outputs: outputBinding: glob: "*_umap_qc_mtrcs.png" doc: | - QC metrics on cells UMAP. + UMAP, QC metrics. PNG format umap_qc_mtrcs_plot_pdf: @@ -431,7 +431,7 @@ outputs: outputBinding: glob: "*_umap_qc_mtrcs.pdf" doc: | - QC metrics on cells UMAP. + UMAP, QC metrics. PDF format umap_plot_png: @@ -439,7 +439,7 @@ outputs: outputBinding: glob: "*_umap.png" doc: | - Cells UMAP. + UMAP, colored by dataset. PNG format umap_plot_pdf: @@ -447,7 +447,7 @@ outputs: outputBinding: glob: "*_umap.pdf" doc: | - Cells UMAP. + UMAP, colored by dataset. PDF format umap_spl_ph_plot_png: @@ -455,7 +455,8 @@ outputs: outputBinding: glob: "*_umap_spl_ph.png" doc: | - Split by cell cycle phase cells UMAP. + UMAP, colored by dataset, split by + cell cycle phase. PNG format umap_spl_ph_plot_pdf: @@ -463,7 +464,8 @@ outputs: outputBinding: glob: "*_umap_spl_ph.pdf" doc: | - Split by cell cycle phase cells UMAP. + UMAP, colored by dataset, split by + cell cycle phase. PDF format ccpca_plot_png: @@ -471,7 +473,7 @@ outputs: outputBinding: glob: "*_ccpca.png" doc: | - Cells PCA using only cell cycle genes. + PCA, colored by cell cycle phase. PNG format ccpca_plot_pdf: @@ -479,7 +481,7 @@ outputs: outputBinding: glob: "*_ccpca.pdf" doc: | - Cells PCA using only cell cycle genes. + PCA, colored by cell cycle phase. 
PDF format umap_spl_mito_plot_png: @@ -487,7 +489,8 @@ outputs: outputBinding: glob: "*_umap_spl_mito.png" doc: | - Split by the percentage of transcripts mapped to mitochondrial genes cells UMAP. + UMAP, colored by dataset, split by + mitochondrial percentage. PNG format umap_spl_mito_plot_pdf: @@ -495,7 +498,8 @@ outputs: outputBinding: glob: "*_umap_spl_mito.pdf" doc: | - Split by the percentage of transcripts mapped to mitochondrial genes cells UMAP. + UMAP, colored by dataset, split by + mitochondrial percentage. PDF format umap_spl_umi_plot_png: @@ -503,7 +507,8 @@ outputs: outputBinding: glob: "*_umap_spl_umi.png" doc: | - Split by the transcripts per cell counts cells UMAP. + UMAP, colored by dataset, split by + transcripts per cell. PNG format umap_spl_umi_plot_pdf: @@ -511,7 +516,8 @@ outputs: outputBinding: glob: "*_umap_spl_umi.pdf" doc: | - Split by the transcripts per cell counts cells UMAP. + UMAP, colored by dataset, split by + transcripts per cell. PDF format umap_spl_gene_plot_png: @@ -519,7 +525,8 @@ outputs: outputBinding: glob: "*_umap_spl_gene.png" doc: | - Split by the genes per cell counts cells UMAP. + UMAP, colored by dataset, split by + genes per cell. PNG format umap_spl_gene_plot_pdf: @@ -527,7 +534,8 @@ outputs: outputBinding: glob: "*_umap_spl_gene.pdf" doc: | - Split by the genes per cell counts cells UMAP. + UMAP, colored by dataset, split by + genes per cell. PDF format umap_spl_idnt_plot_png: @@ -535,7 +543,7 @@ outputs: outputBinding: glob: "*_umap_spl_idnt.png" doc: | - Split by dataset cells UMAP. + UMAP, split by dataset. PNG format umap_spl_idnt_plot_pdf: @@ -543,7 +551,7 @@ outputs: outputBinding: glob: "*_umap_spl_idnt.pdf" doc: | - Split by dataset cells UMAP. + UMAP, split by dataset. PDF format ccpca_spl_idnt_plot_png: @@ -551,7 +559,8 @@ outputs: outputBinding: glob: "*_ccpca_spl_idnt.png" doc: | - Split by dataset cells PCA using only cell cycle genes. + PCA, colored by cell cycle phase, + split by dataset. 
PNG format ccpca_spl_idnt_plot_pdf: @@ -559,7 +568,8 @@ outputs: outputBinding: glob: "*_ccpca_spl_idnt.pdf" doc: | - Split by dataset cells PCA using only cell cycle genes. + PCA, colored by cell cycle phase, + split by dataset. PDF format umap_spl_cnd_plot_png: @@ -567,7 +577,8 @@ outputs: outputBinding: glob: "*_umap_spl_cnd.png" doc: | - Split by grouping condition cells UMAP. + UMAP, colored by dataset, split by + grouping condition. PNG format umap_spl_cnd_plot_pdf: @@ -575,7 +586,8 @@ outputs: outputBinding: glob: "*_umap_spl_cnd.pdf" doc: | - Split by grouping condition cells UMAP. + UMAP, colored by dataset, split by + grouping condition. PDF format umap_gr_cnd_spl_ph_plot_png: @@ -583,7 +595,8 @@ outputs: outputBinding: glob: "*_umap_gr_cnd_spl_ph.png" doc: | - Grouped by condition split by cell cycle cells UMAP. + UMAP, colored by grouping condition, + split by cell cycle phase. PNG format umap_gr_cnd_spl_ph_plot_pdf: @@ -591,7 +604,8 @@ outputs: outputBinding: glob: "*_umap_gr_cnd_spl_ph.pdf" doc: | - Grouped by condition split by cell cycle cells UMAP. + UMAP, colored by grouping condition, + split by cell cycle phase. PDF format ccpca_spl_cnd_plot_png: @@ -599,7 +613,8 @@ outputs: outputBinding: glob: "*_ccpca_spl_cnd.png" doc: | - Split by grouping condition cells PCA using only cell cycle genes. + PCA, colored by cell cycle phase, + split by grouping condition. PNG format ccpca_spl_cnd_plot_pdf: @@ -607,7 +622,8 @@ outputs: outputBinding: glob: "*_ccpca_spl_cnd.pdf" doc: | - Split by grouping condition cells PCA using only cell cycle genes. + PCA, colored by cell cycle phase, + split by grouping condition. PDF format umap_gr_cnd_spl_mito_plot_png: @@ -615,7 +631,8 @@ outputs: outputBinding: glob: "*_umap_gr_cnd_spl_mito.png" doc: | - Grouped by condition split by the percentage of transcripts mapped to mitochondrial genes cells UMAP. + UMAP, colored by grouping condition, + split by mitochondrial percentage. 
PNG format umap_gr_cnd_spl_mito_plot_pdf: @@ -623,7 +640,8 @@ outputs: outputBinding: glob: "*_umap_gr_cnd_spl_mito.pdf" doc: | - Grouped by condition split by the percentage of transcripts mapped to mitochondrial genes cells UMAP. + UMAP, colored by grouping condition, + split by mitochondrial percentage. PDF format umap_gr_cnd_spl_umi_plot_png: @@ -631,7 +649,8 @@ outputs: outputBinding: glob: "*_umap_gr_cnd_spl_umi.png" doc: | - Grouped by condition split by the transcripts per cell counts cells UMAP. + UMAP, colored by grouping condition, + split by transcripts per cell. PNG format umap_gr_cnd_spl_umi_plot_pdf: @@ -639,7 +658,8 @@ outputs: outputBinding: glob: "*_umap_gr_cnd_spl_umi.pdf" doc: | - Grouped by condition split by the transcripts per cell counts cells UMAP. + UMAP, colored by grouping condition, + split by transcripts per cell. PDF format umap_gr_cnd_spl_gene_plot_png: @@ -647,7 +667,8 @@ outputs: outputBinding: glob: "*_umap_gr_cnd_spl_gene.png" doc: | - Grouped by condition split by the genes per cell counts cells UMAP. + UMAP, colored by grouping condition, + split by genes per cell. PNG format umap_gr_cnd_spl_gene_plot_pdf: @@ -655,7 +676,8 @@ outputs: outputBinding: glob: "*_umap_gr_cnd_spl_gene.pdf" doc: | - Grouped by condition split by the genes per cell counts cells UMAP. + UMAP, colored by grouping condition, + split by genes per cell. 
PDF format ucsc_cb_config_data: diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index 1c83645b..31c3c407 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.23 + dockerPull: biowardrobe2/sc-tools:v0.0.24 inputs: diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index d9a0d4a8..98c51ae1 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.23 + dockerPull: biowardrobe2/sc-tools:v0.0.24 inputs: diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 1861266b..a80bcaee 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -1089,6 +1089,14 @@ outputs: doc: | Processed seurat data in RDS format + datasets_metadata: + type: File + outputSource: sc_multiome_filter/datasets_metadata + label: "Example of datasets metadata" + doc: | + Example of datasets metadata file + in TSV format + sc_multiome_filter_stdout_log: type: File outputSource: sc_multiome_filter/stdout_log @@ -1246,6 +1254,7 @@ steps: - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds + - datasets_metadata - stdout_log - stderr_log diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index 05bdc417..c088145e 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -606,6 +606,14 @@ outputs: doc: | Processed Seurat data in RDS format + datasets_metadata: + type: File + outputSource: sc_rna_filter/datasets_metadata + label: "Example of datasets metadata" + doc: | + Example of datasets metadata file + in TSV format + sc_rna_filter_stdout_log: type: File outputSource: sc_rna_filter/stdout_log @@ -710,6 +718,7 @@ steps: - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds + - datasets_metadata - stdout_log - stderr_log diff --git 
a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 10781627..4c45d4c8 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -29,134 +29,199 @@ inputs: alias: type: string - label: "Experiment short name/alias" + label: "Analysis name" sd:preview: position: 1 query_data_rds: type: File - label: "Experiment run through either Single-cell RNA-Seq or Multiome ATAC and RNA-Seq Filtering Analysis" + label: "Single-cell Analysis with Filtered RNA-Seq Datasets" doc: | - Path to the RDS file to load Seurat object from. This file should include genes - expression information stored in the RNA assay. + Any analysis that includes single-cell + multiome ATAC and RNA-Seq or just + RNA-Seq datasets filtered by QC metrics + to include only high-quality cells. 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" 'sd:localLabel': true - cell_cycle_data: + normalization_method: type: - "null" - type: enum symbols: - - "hg19" - - "hg38" - - "mm10" - label: "Genome type for cell cycle genes selection" + - "sct" + - "sctglm" + - "log" + label: "Normalization method" + default: "sctglm" doc: | - Genome type to use for cell cycle score - assignment. If not provided, cell cycle - scores won't be assigned. + Normalization and variance stabilization + method to remove technical variability + between the cells. "sct" - use sctransform + package described in Hafemeister and Satija, + Genome Biology 2019. "sctglm" - use updated + sctransform package described in Choudhary + and Satija, Genome Biology, 2022. "log" - + use a combination of NormalizeData and + ScaleData functions described in Stuart and + Butler, Cell 2019. + Default: sctglm - datasets_metadata: - type: File? 
- label: "Path to the TSV/CSV file to optionally extend Seurat object metadata with categorical values" + integration_method: + type: + - "null" + - type: enum + symbols: + - "seurat" + - "harmony" + - "none" + label: "Integration method" + default: "seurat" doc: | - Path to the TSV/CSV file to optionally extend Seurat object metadata with - categorical values using samples identities. First column - 'library_id' - should correspond to all unique values from the 'new.ident' column of the - loaded Seurat object. If any of the provided in this file columns are already - present in the Seurat object metadata, they will be overwritten. When combined - with --barcodes parameter, first the metadata will be extended, then barcode - filtering will be applied. - Default: no extra metadata is added + Integration method to match shared cell + types and states across experimental + batches, donors, conditions, or datasets. + "seurat" - use cross-dataset pairs of + cells that are in a matched biological + state ("anchors") to correct for technical + differences. "harmony" - use Harmony + algorithm described in Korsunsky, Millard, + and Fan, Nat Methods, 2019, to iteratively + correct PCA embeddings. "none" - do not + run integration, merge datasets instead. + Default: seurat - barcodes_data: - type: File? - label: "Optional TSV/CSV file to prefilter and extend metadata be barcodes. First column should be named as 'barcode'" - doc: | - Path to the TSV/CSV file to optionally prefilter and - extend Seurat object metadata be selected barcodes. - First column should be named as 'barcode'. If file - includes any other columns they will be added to the - Seurat object metadata ovewriting the existing ones if - those are present. 
- Default: all cells used, no extra metadata is added + integrate_by: + type: + - "null" + - string + - type: enum + symbols: + - "dataset" + - "condition" + label: "Batch correction (harmony)" + default: "dataset" + doc: | + When "harmony" is selected as "Integration + method", batch effects are corrected based + on the provided factors. Specifically, + "dataset" is used to integrate out the + influence of the cells' dataset of origin, + while the factor "condition" is used to + eliminate the influence of dataset grouping. + Default: dataset dimensions: type: int? - label: "Dimensionality to use in UMAP projection (from 1 to 50)" + label: "Target dimensionality" default: 40 doc: | - Dimensionality to use in UMAP projection (from 1 to 50). If single value N - is provided, use from 1 to N PCs. If multiple values are provided, subset to - only selected PCs. In combination with --ntgr set to harmony, selected principle - components will be used in Harmony integration. - Default: from 1 to 10 + Number of princinpal components to be used + in PCA and UMAP projection. Accepted values + range from 1 to 50. + Default: 40 - normalization_method: + cell_cycle_data: type: - "null" - type: enum symbols: - - "sct" - - "log" - - "sctglm" - label: "Normalization method applied to genes expression counts" - default: "sctglm" + - "human" + - "mouse" + - "none" + label: "Cell cycle gene set" + default: "none" doc: | - Normalization method applied to genes expression counts. If loaded Seurat object - includes multiple datasets, normalization will be run independently for each of - them, unless integration is disabled with 'none' or set to 'harmony' - Default: sct - 'sd:layout': - advanced: true + Assign cell cycle score and + phase based on the gene set + for the selected organism. + When selected "none", skip + cell cycle score assignment. 
+ Default: "none" - integration_method: + regress_cellcycle: type: - "null" - type: enum symbols: - - "seurat" - - "harmony" - - "none" - label: "Integration method used for joint analysis of multiple datasets" - default: "seurat" - doc: | - Integration method used for joint analysis of multiple datasets. Automatically - set to 'none' if loaded Seurat object includes only one dataset. - Default: seurat - 'sd:layout': - advanced: true + - "completely" + - "partialy" + - "do not remove" + label: "Remove cell cycle" + default: "do not remove" + doc: | + Remove the influence cell cycle + phase on the dimensionality + reduction results. When selected + "completely", regress all signals + associated with the cell cycle phase. + For "partialy" - regress only the + differences in cell cycle phase + among proliferating cells, signals + separating non-cycling and cycling + cells will be maintained. When + selected "do not remove" - do not + regress signals associated with the + cell cycle phase. Ignored if cell + cycle gene set is not provided. + Default: "do not remove" - integrate_by: - type: string? - label: "Variable(s) to be integrated out when running multiple integration with Harmony" - default: "new.ident" - doc: | - Column(s) from the Seurat object metadata to define the variable(s) that should - be integrated out when running multiple datasets integration with harmony. May - include columns from the extra metadata added with --metadata parameter. Ignored - if --ntgr is not set to harmony. - Default: new.ident - 'sd:layout': - advanced: true + datasets_metadata: + type: File? + label: "Datasets metadata (optional)" + doc: | + If the selected single-cell analysis + includes multiple aggregated datasets, + each of them can be assigned to a + separate group by one or multiple + categories. 
This can be achieved by + providing a TSV/CSV file with + "library_id" as the first column and + any number of additional columns with + unique names, representing the desired + grouping categories. To obtain a proper + template of this file, download + "datasets_metadata.tsv" output from the + "Files" tab of the selected "Single-cell + Analysis with Filtered RNA-Seq Datasets" + and add extra columns as needed. + + barcodes_data: + type: File? + label: "Selected cell barcodes (optional)" + doc: | + A TSV/CSV file to optionally prefilter + the single cell data by including only + the cells with the selected barcodes. + The provided file should include at + least one column named "barcode", with + one cell barcode per line. All other + columns, except for "barcode", will be + added to the single cell metadata loaded + from "Single-cell Analysis with Filtered + RNA-Seq Datasets" and can be utilized in + the current or future steps of analysis. highly_var_genes_count: type: int? - label: "Number of highly variable genes used in datasets integration, scaling and dimensionality reduction" + label: "Number of highly variable genes" default: 3000 doc: | - Number of highly variable genes used in datasets integration, scaling and - dimensionality reduction. + The number of highly variable genes + to be used in gene expression scaling, + datasets integration, and dimensionality + reduction. Default: 3000 'sd:layout': advanced: true regress_mito_perc: type: boolean? - label: "Regress the percentage of transcripts mapped to mitochondrial genes as a confounding source of variation" + label: "Regress mitochondrial percentage" default: false doc: | - Regress the percentage of transcripts mapped to mitochondrial genes as a + Regress the percentage of transcripts + mapped to mitochondrial genes as a confounding source of variation. Default: false 'sd:layout': @@ -164,102 +229,15 @@ inputs: regress_genes: type: string? 
- label: "Regress genes per cell counts as a confounding source of variation" + label: "Regress genes" default: null doc: | - Genes which expression should be regressed as a confounding source of variation. + Regress expression of the selected genes + as a confounding source of variation. Default: None 'sd:layout': advanced: true - regress_cellcycle: - type: - - "null" - - type: enum - symbols: - - "completely" - - "partialy" - - "none" - label: "Regress cell cycle scores as a confounding source of variation" - default: "none" - doc: | - "completely" - regress all signals associated with cell cycle phase. - "partialy" - regress only differences in cell cycle phase among - proliferating cells, signals separating non-cycling and cycling cells - will be maintained. - "none" - do not regress signals associated with cell cycle phase - Default: "none" - 'sd:layout': - advanced: true - - umap_spread: - type: float? - label: "UMAP Spread - the effective scale of embedded points (determines how clustered/clumped the embedded points are)" - default: 1 - doc: | - The effective scale of embedded points on UMAP. In combination with '--mindist' - it determines how clustered/clumped the embedded points are. - Default: 1 - 'sd:layout': - advanced: true - - umap_mindist: - type: float? - label: "UMAP Min. Dist. - controls how tightly the embedding is allowed compress points together" - default: 0.3 - doc: | - Controls how tightly the embedding is allowed compress points together on UMAP. - Larger values ensure embedded points are moreevenly distributed, while smaller - values allow the algorithm to optimise more accurately with regard to local structure. - Sensible values are in the range 0.001 to 0.5. - Default: 0.3 - 'sd:layout': - advanced: true - - umap_neighbors: - type: int? - label: "UMAP Neighbors Number - determines the number of neighboring points used" - default: 30 - doc: | - Determines the number of neighboring points used in UMAP. 
Larger values will result - in more global structure being preserved at the loss of detailed local structure. - In general this parameter should often be in the range 5 to 50. - Default: 30 - 'sd:layout': - advanced: true - - umap_metric: - type: - - "null" - - type: enum - symbols: - - "euclidean" - - "cosine" - - "correlation" - label: "UMAP Dist. Metric - the metric to use to compute distances in high dimensional space" - default: "cosine" - doc: | - The metric to use to compute distances in high dimensional space for UMAP. - Default: cosine - 'sd:layout': - advanced: true - - umap_method: - type: - - "null" - - type: enum - symbols: - - "uwot" - - "uwot-learn" - - "umap-learn" - label: "UMAP implementation to run (if set to 'umap-learn' use 'correlation' distance metric)" - default: "uwot" - doc: | - UMAP implementation to run. If set to 'umap-learn' use --umetric 'correlation' - Default: uwot - 'sd:layout': - advanced: true - export_ucsc_cb: type: boolean? default: false @@ -284,41 +262,12 @@ inputs: - "classic" - "void" default: "classic" - label: "Color theme for all generated plots" + label: "Plots color theme" doc: | - Color theme for all generated plots. One of gray, bw, linedraw, light, - dark, minimal, classic, void. + Color theme for all plots saved + as PNG files. Default: classic - 'sd:layout': - advanced: true - - parallel_memory_limit: - type: - - "null" - - type: enum - symbols: - - "32" - default: "32" - label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" - doc: | - Maximum memory in GB allowed to be shared between the workers - when using multiple --cpus. - Forced to 32 GB - 'sd:layout': - advanced: true - - vector_memory_limit: - type: - - "null" - - type: enum - symbols: - - "96" - default: "96" - label: "Maximum vector memory in GB allowed to be used by R" - doc: | - Maximum vector memory in GB allowed to be used by R. 
- Forced to 96 GB - 'sd:layout': + "sd:layout": advanced: true threads: @@ -329,10 +278,13 @@ inputs: - "1" - "2" default: "1" - label: "Number of cores/cpus to use" + label: "Cores/CPUs number" doc: | - Number of cores/cpus to use - 'sd:layout': + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 1 + "sd:layout": advanced: true @@ -341,206 +293,203 @@ outputs: elbow_plot_png: type: File? outputSource: sc_rna_reduce/elbow_plot_png - label: "Elbow plot (from cells PCA)" + label: "Elbow plot" doc: | - Elbow plot (from cells PCA). - PNG format + Elbow plot to evaluate the number of + principal components that capture the + majority of the variation in the data. 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Elbow plot (from cells PCA)' + tab: 'QC' + Caption: 'Elbow plot' qc_dim_corr_plot_png: type: File? outputSource: sc_rna_reduce/qc_dim_corr_plot_png - label: "Correlation plots between QC metrics and cells PCA components" + label: "Correlation between QC metrics and principal components" doc: | - Correlation plots between QC metrics and cells PCA components. - PNG format + Correlation between QC metrics and + principal components 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Correlation plots between QC metrics and cells PCA components' + tab: 'QC' + Caption: 'Correlation between QC metrics and principal components' umap_qc_mtrcs_plot_png: type: File? outputSource: sc_rna_reduce/umap_qc_mtrcs_plot_png - label: "QC metrics on cells UMAP" + label: "UMAP, QC metrics" doc: | - QC metrics on cells UMAP. - PNG format + UMAP, QC metrics 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'QC metrics on cells UMAP' - - umap_plot_png: - type: File? - outputSource: sc_rna_reduce/umap_plot_png - label: "Cells UMAP" - doc: | - Cells UMAP. 
- PNG format - 'sd:visualPlugins': - - image: - tab: 'Overall' - Caption: 'Cells UMAP' + tab: 'QC' + Caption: 'UMAP, QC metrics' ccpca_plot_png: type: File? outputSource: sc_rna_reduce/ccpca_plot_png - label: "Cells PCA using only cell cycle genes" + label: "PCA, colored by cell cycle phase" doc: | - Cells PCA using only cell cycle genes. - PNG format + PCA, colored by cell cycle phase 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Cells PCA using only cell cycle genes' + tab: 'QC' + Caption: 'PCA, colored by cell cycle phase' - umap_spl_ph_plot_png: + umap_plot_png: type: File? - outputSource: sc_rna_reduce/umap_spl_ph_plot_png - label: "Split by cell cycle phase cells UMAP" + outputSource: sc_rna_reduce/umap_plot_png + label: "UMAP, colored by dataset" doc: | - Split by cell cycle phase cells UMAP. - PNG format + UMAP, colored by dataset 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by cell cycle phase cells UMAP' + Caption: 'UMAP, colored by dataset' - umap_spl_mito_plot_png: + umap_spl_idnt_plot_png: type: File? - outputSource: sc_rna_reduce/umap_spl_mito_plot_png - label: "Split by the percentage of transcripts mapped to mitochondrial genes cells UMAP" + outputSource: sc_rna_reduce/umap_spl_idnt_plot_png + label: "UMAP, split by dataset" doc: | - Split by the percentage of transcripts mapped to mitochondrial genes cells UMAP. - PNG format + UMAP, split by dataset 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by the percentage of transcripts mapped to mitochondrial genes cells UMAP' + Caption: 'UMAP, split by dataset' umap_spl_umi_plot_png: type: File? outputSource: sc_rna_reduce/umap_spl_umi_plot_png - label: "Split by the transcripts per cell counts cells UMAP" + label: "UMAP, colored by dataset, split by transcripts per cell" doc: | - Split by the transcripts per cell counts cells UMAP. 
- PNG format + UMAP, colored by dataset, split by + transcripts per cell 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by the transcripts per cell counts cells UMAP' + Caption: 'UMAP, colored by dataset, split by transcripts per cell' umap_spl_gene_plot_png: type: File? outputSource: sc_rna_reduce/umap_spl_gene_plot_png - label: "Split by the genes per cell counts cells UMAP" + label: "UMAP, colored by dataset, split by genes per cell" doc: | - Split by the genes per cell counts cells UMAP. - PNG format + UMAP, colored by dataset, split by + genes per cell 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by the genes per cell counts cells UMAP' + Caption: 'UMAP, colored by dataset, split by genes per cell' - umap_spl_idnt_plot_png: + umap_spl_mito_plot_png: type: File? - outputSource: sc_rna_reduce/umap_spl_idnt_plot_png - label: "Split by dataset cells UMAP" + outputSource: sc_rna_reduce/umap_spl_mito_plot_png + label: "UMAP, colored by dataset, split by mitochondrial percentage" + doc: | + UMAP, colored by dataset, split by + mitochondrial percentage + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'UMAP, colored by dataset, split by mitochondrial percentage' + + umap_spl_ph_plot_png: + type: File? + outputSource: sc_rna_reduce/umap_spl_ph_plot_png + label: "UMAP, colored by dataset, split by cell cycle phase" doc: | - Split by dataset cells UMAP. - PNG format + UMAP, colored by dataset, split by + cell cycle phase 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by dataset cells UMAP' + Caption: 'UMAP, colored by dataset, split by cell cycle phase' ccpca_spl_idnt_plot_png: type: File? outputSource: sc_rna_reduce/ccpca_spl_idnt_plot_png - label: "Split by dataset cells PCA using only cell cycle genes" + label: "PCA, colored by cell cycle phase, split by dataset" doc: | - Split by dataset cells PCA using only cell cycle genes. 
- PNG format + PCA, colored by cell cycle phase, + split by dataset 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by dataset cells PCA using only cell cycle genes' + Caption: 'PCA, colored by cell cycle phase, split by dataset' umap_spl_cnd_plot_png: type: File? outputSource: sc_rna_reduce/umap_spl_cnd_plot_png - label: "Split by grouping condition cells UMAP" + label: "UMAP, colored by dataset, split by grouping condition" doc: | - Split by grouping condition cells UMAP. - PNG format + UMAP, colored by dataset, split by + grouping condition 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Split by grouping condition cells UMAP' + Caption: 'UMAP, colored by dataset, split by grouping condition' - umap_gr_cnd_spl_ph_plot_png: + umap_gr_cnd_spl_umi_plot_png: type: File? - outputSource: sc_rna_reduce/umap_gr_cnd_spl_ph_plot_png - label: "Grouped by condition split by cell cycle cells UMAP" + outputSource: sc_rna_reduce/umap_gr_cnd_spl_umi_plot_png + label: "UMAP, colored by grouping condition, split by transcripts per cell" doc: | - Grouped by condition split by cell cycle cells UMAP. - PNG format + UMAP, colored by grouping condition, + split by transcripts per cell 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Grouped by condition split by cell cycle cells UMAP' + Caption: 'UMAP, colored by grouping condition, split by transcripts per cell' - ccpca_spl_cnd_plot_png: + umap_gr_cnd_spl_gene_plot_png: type: File? - outputSource: sc_rna_reduce/ccpca_spl_cnd_plot_png - label: "Split by grouping condition cells PCA using only cell cycle genes" + outputSource: sc_rna_reduce/umap_gr_cnd_spl_gene_plot_png + label: "UMAP, colored by grouping condition, split by genes per cell" doc: | - Split by grouping condition cells PCA using only cell cycle genes. 
- PNG format + UMAP, colored by grouping condition, + split by genes per cell 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Split by grouping condition cells PCA using only cell cycle genes' + Caption: 'UMAP, colored by grouping condition, split by genes per cell' umap_gr_cnd_spl_mito_plot_png: type: File? outputSource: sc_rna_reduce/umap_gr_cnd_spl_mito_plot_png - label: "Grouped by condition split by the percentage of transcripts mapped to mitochondrial genes cells UMAP" + label: "UMAP, colored by grouping condition, split by mitochondrial percentage" doc: | - Grouped by condition split by the percentage of transcripts mapped to mitochondrial genes cells UMAP. - PNG format + UMAP, colored by grouping condition, + split by mitochondrial percentage 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Grouped by condition split by the percentage of transcripts mapped to mitochondrial genes cells UMAP' + Caption: 'UMAP, colored by grouping condition, split by mitochondrial percentage' - umap_gr_cnd_spl_umi_plot_png: + umap_gr_cnd_spl_ph_plot_png: type: File? - outputSource: sc_rna_reduce/umap_gr_cnd_spl_umi_plot_png - label: "Grouped by condition split by the transcripts per cell counts cells UMAP" + outputSource: sc_rna_reduce/umap_gr_cnd_spl_ph_plot_png + label: "UMAP, colored by grouping condition, split by cell cycle phase" doc: | - Grouped by condition split by the transcripts per cell counts cells UMAP. - PNG format + UMAP, colored by grouping condition, + split by cell cycle phase 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Grouped by condition split by the transcripts per cell counts cells UMAP' + Caption: 'UMAP, colored by grouping condition, split by cell cycle phase' - umap_gr_cnd_spl_gene_plot_png: + ccpca_spl_cnd_plot_png: type: File? 
- outputSource: sc_rna_reduce/umap_gr_cnd_spl_gene_plot_png - label: "Grouped by condition split by the genes per cell counts cells UMAP" + outputSource: sc_rna_reduce/ccpca_spl_cnd_plot_png + label: "PCA, colored by cell cycle phase, split by grouping condition" doc: | - Grouped by condition split by the genes per cell counts cells UMAP. - PNG format + PCA, colored by cell cycle phase, + split by grouping condition 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Grouped by condition split by the genes per cell counts cells UMAP' + Caption: 'PCA, colored by cell cycle phase, split by grouping condition' ucsc_cb_html_data: type: Directory? @@ -593,30 +542,47 @@ steps: in: query_data_rds: query_data_rds barcodes_data: barcodes_data - cell_cycle_data: cell_cycle_data + cell_cycle_data: + source: cell_cycle_data + valueFrom: | + ${ + if (self.includes("human")) { + return "hg38"; + } else if (self.includes("mouse")) { + return "mm10"; + } else { + return null; + } + } + regress_ccycle_full: + source: regress_cellcycle + valueFrom: $(self.includes("completely")?true:null) + regress_ccycle_diff: + source: regress_cellcycle + valueFrom: $(self.includes("partialy")?true:null) datasets_metadata: datasets_metadata normalization_method: normalization_method integration_method: integration_method integrate_by: source: integrate_by - valueFrom: $(split_features(self)) + valueFrom: | + ${ + if (self == "none") { + return null; + } else if (self == "dataset") { + return "new.ident"; + } else if (self == "condition") { + return "condition"; + } else { + return split_features(self); + } + } highly_var_genes_count: highly_var_genes_count regress_mito_perc: regress_mito_perc regress_genes: source: regress_genes valueFrom: $(split_features(self)) - regress_ccycle_full: - source: regress_cellcycle - valueFrom: $(self=="completely"?true:null) - regress_ccycle_diff: - source: regress_cellcycle - valueFrom: $(self=="partialy"?true:null) dimensions: dimensions - umap_spread: 
umap_spread - umap_mindist: umap_mindist - umap_neighbors: umap_neighbors - umap_metric: umap_metric - umap_method: umap_method verbose: default: true export_ucsc_cb: export_ucsc_cb @@ -624,11 +590,9 @@ steps: default: true color_theme: color_theme parallel_memory_limit: - source: parallel_memory_limit - valueFrom: $(parseInt(self)) + default: 32 vector_memory_limit: - source: vector_memory_limit - valueFrom: $(parseInt(self)) + default: 96 threads: source: threads valueFrom: $(parseInt(self)) @@ -705,4 +669,5 @@ s:creator: doc: | Single-cell RNA-Seq Dimensionality Reduction Analysis - Integrates multiple single-cell RNA-Seq datasets, reduces dimensionality using PCA. \ No newline at end of file + Integrates multiple single-cell RNA-Seq datasets, + reduces dimensionality using PCA. \ No newline at end of file From fb5d10b1cfab6e89a4ff90f345f71fb4a120a3ef Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 7 Jul 2023 16:40:35 -0400 Subject: [PATCH 044/162] Correct typo --- workflows/sc-rna-reduce.cwl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 4c45d4c8..36e11155 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -145,7 +145,7 @@ inputs: - type: enum symbols: - "completely" - - "partialy" + - "partially" - "do not remove" label: "Remove cell cycle" default: "do not remove" @@ -155,7 +155,7 @@ inputs: reduction results. When selected "completely", regress all signals associated with the cell cycle phase. 
- For "partialy" - regress only the + For "partially" - regress only the differences in cell cycle phase among proliferating cells, signals separating non-cycling and cycling @@ -559,7 +559,7 @@ steps: valueFrom: $(self.includes("completely")?true:null) regress_ccycle_diff: source: regress_cellcycle - valueFrom: $(self.includes("partialy")?true:null) + valueFrom: $(self.includes("partially")?true:null) datasets_metadata: datasets_metadata normalization_method: normalization_method integration_method: integration_method From b1d814352c41df4769aff56a4f6cc3a0f18351af Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 11 Jul 2023 16:29:54 -0400 Subject: [PATCH 045/162] Refactor sc-rna-cluster pipeline, update docker to the latest --- tools/sc-atac-cluster.cwl | 2 +- tools/sc-atac-coverage.cwl | 2 +- tools/sc-atac-dbinding.cwl | 2 +- tools/sc-atac-reduce.cwl | 2 +- tools/sc-ctype-assign.cwl | 2 +- tools/sc-multiome-filter.cwl | 2 +- tools/sc-rna-cluster.cwl | 117 +++++---- tools/sc-rna-da-cells.cwl | 2 +- tools/sc-rna-de-pseudobulk.cwl | 2 +- tools/sc-rna-filter.cwl | 2 +- tools/sc-rna-reduce.cwl | 2 +- tools/sc-triangulate.cwl | 2 +- tools/sc-wnn-cluster.cwl | 2 +- workflows/sc-rna-cluster.cwl | 421 +++++++++++++++------------------ 14 files changed, 275 insertions(+), 287 deletions(-) diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index edc3df1c..f0449b20 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.24 + dockerPull: biowardrobe2/sc-tools:v0.0.25 inputs: diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index 9a1509c4..c7ab53e9 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.24 + dockerPull: biowardrobe2/sc-tools:v0.0.25 inputs: diff --git 
a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index 2a71381a..ac912288 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.24 + dockerPull: biowardrobe2/sc-tools:v0.0.25 inputs: diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index d60921b9..23879948 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.24 + dockerPull: biowardrobe2/sc-tools:v0.0.25 inputs: diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index ca3f89a4..c332280a 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.24 + dockerPull: biowardrobe2/sc-tools:v0.0.25 inputs: diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index b6449db5..14f59657 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.24 + dockerPull: biowardrobe2/sc-tools:v0.0.25 inputs: diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index 43604ee4..b6fc7399 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.24 + dockerPull: biowardrobe2/sc-tools:v0.0.25 inputs: @@ -265,7 +265,7 @@ outputs: outputBinding: glob: "*_umap_res_*.png" doc: | - Clustered cells UMAP. + UMAP, colored by cluster. PNG format umap_res_plot_pdf: @@ -276,7 +276,7 @@ outputs: outputBinding: glob: "*_umap_res_*.pdf" doc: | - Clustered cells UMAP. + UMAP, colored by cluster. 
PDF format slh_res_plot_png: @@ -287,7 +287,7 @@ outputs: outputBinding: glob: "*_slh_res_*.png" doc: | - Silhouette scores. Downsampled to max 500 cells per cluster. + Silhouette scores. PNG format slh_res_plot_pdf: @@ -298,7 +298,7 @@ outputs: outputBinding: glob: "*_slh_res_*.pdf" doc: | - Silhouette scores. Downsampled to max 500 cells per cluster. + Silhouette scores. PDF format umap_spl_idnt_res_plot_png: @@ -309,7 +309,8 @@ outputs: outputBinding: glob: "*_umap_spl_idnt_res_*.png" doc: | - Split by dataset clustered cells UMAP. + UMAP, colored by cluster, + split by dataset. PNG format umap_spl_idnt_res_plot_pdf: @@ -320,7 +321,8 @@ outputs: outputBinding: glob: "*_umap_spl_idnt_res_*.pdf" doc: | - Split by dataset clustered cells UMAP. + UMAP, colored by cluster, + split by dataset. PDF format cmp_gr_clst_spl_idnt_res_plot_png: @@ -331,7 +333,9 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_idnt_res_*.png" doc: | - Grouped by cluster split by dataset cells composition plot. Downsampled. + Composition plot, colored by + cluster, split by dataset, + downsampled. PNG format cmp_gr_clst_spl_idnt_res_plot_pdf: @@ -342,7 +346,9 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_idnt_res_*.pdf" doc: | - Grouped by cluster split by dataset cells composition plot. Downsampled. + Composition plot, colored by + cluster, split by dataset, + downsampled. PDF format cmp_gr_idnt_spl_clst_res_plot_png: @@ -353,7 +359,9 @@ outputs: outputBinding: glob: "*_cmp_gr_idnt_spl_clst_res_*.png" doc: | - Grouped by dataset split by cluster cells composition plot. Downsampled. + Composition plot, colored by + dataset, split by cluster, + downsampled. PNG format cmp_gr_idnt_spl_clst_res_plot_pdf: @@ -364,7 +372,9 @@ outputs: outputBinding: glob: "*_cmp_gr_idnt_spl_clst_res_*.pdf" doc: | - Grouped by dataset split by cluster cells composition plot. Downsampled. + Composition plot, colored by + dataset, split by cluster, + downsampled. 
PDF format umap_spl_cnd_res_plot_png: @@ -375,7 +385,8 @@ outputs: outputBinding: glob: "*_umap_spl_cnd_res_*.png" doc: | - Split by grouping condition clustered cells UMAP. + UMAP, colored by cluster, + split by grouping condition. PNG format umap_spl_cnd_res_plot_pdf: @@ -386,7 +397,8 @@ outputs: outputBinding: glob: "*_umap_spl_cnd_res_*.pdf" doc: | - Split by grouping condition clustered cells UMAP. + UMAP, colored by cluster, + split by grouping condition. PDF format cmp_gr_clst_spl_cnd_res_plot_png: @@ -397,7 +409,9 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_cnd_res_*.png" doc: | - Grouped by cluster split by condition cells composition plot. Downsampled. + Composition plot, colored by + cluster, split by grouping + condition, downsampled. PNG format cmp_gr_clst_spl_cnd_res_plot_pdf: @@ -408,7 +422,9 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_cnd_res_*.pdf" doc: | - Grouped by cluster split by condition cells composition plot. Downsampled. + Composition plot, colored by + cluster, split by grouping + condition, downsampled. PDF format cmp_gr_cnd_spl_clst_res_plot_png: @@ -419,7 +435,9 @@ outputs: outputBinding: glob: "*_cmp_gr_cnd_spl_clst_res_*.png" doc: | - Grouped by condition split by cluster cells composition plot. Downsampled. + Composition plot, colored by + grouping condition, split by + cluster, downsampled. PNG format cmp_gr_cnd_spl_clst_res_plot_pdf: @@ -430,7 +448,9 @@ outputs: outputBinding: glob: "*_cmp_gr_cnd_spl_clst_res_*.pdf" doc: | - Grouped by condition split by cluster cells composition plot. Downsampled. + Composition plot, colored by + grouping condition, split by + cluster, downsampled. PDF format umap_spl_ph_res_plot_png: @@ -441,7 +461,8 @@ outputs: outputBinding: glob: "*_umap_spl_ph_res_*.png" doc: | - Split by cell cycle phase clustered cells UMAP. + UMAP, colored by cluster, + split by cell cycle phase. 
PNG format umap_spl_ph_res_plot_pdf: @@ -452,7 +473,8 @@ outputs: outputBinding: glob: "*_umap_spl_ph_res_*.pdf" doc: | - Split by cell cycle phase clustered cells UMAP. + UMAP, colored by cluster, + split by cell cycle phase. PDF format cmp_gr_ph_spl_idnt_plot_png: @@ -460,7 +482,9 @@ outputs: outputBinding: glob: "*_cmp_gr_ph_spl_idnt.png" doc: | - Grouped by cell cycle phase split by dataset cells composition plot. Downsampled. + Composition plot, colored by + cell cycle phase, split by + dataset, downsampled. PNG format cmp_gr_ph_spl_idnt_plot_pdf: @@ -468,7 +492,9 @@ outputs: outputBinding: glob: "*_cmp_gr_ph_spl_idnt.pdf" doc: | - Grouped by cell cycle phase split by dataset cells composition plot. Downsampled. + Composition plot, colored by + cell cycle phase, split by + dataset, downsampled. PDF format cmp_gr_ph_spl_clst_res_plot_png: @@ -479,7 +505,9 @@ outputs: outputBinding: glob: "*_cmp_gr_ph_spl_clst_res_*.png" doc: | - Grouped by cell cycle phase split by cluster cells composition plot. Downsampled. + Composition plot, colored by + cell cycle phase, split by + cluster, downsampled. PNG format cmp_gr_ph_spl_clst_res_plot_pdf: @@ -490,7 +518,9 @@ outputs: outputBinding: glob: "*_cmp_gr_ph_spl_clst_res_*.pdf" doc: | - Grouped by cell cycle phase split by cluster cells composition plot. Downsampled. + Composition plot, colored by + cell cycle phase, split by + cluster, downsampled. PDF format xpr_avg_res_plot_png: @@ -501,7 +531,7 @@ outputs: outputBinding: glob: "*_xpr_avg_res_*.png" doc: | - Log normalized scaled average gene expression per cluster. + Gene expression dot plot. PNG format xpr_avg_res_plot_pdf: @@ -512,7 +542,7 @@ outputs: outputBinding: glob: "*_xpr_avg_res_*.pdf" doc: | - Log normalized scaled average gene expression per cluster. + Gene expression dot plot. PDF format xpr_per_cell_plot_png: @@ -523,7 +553,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_[!sgnl_]*.png" doc: | - Log normalized gene expression on cells UMAP. 
+ UMAP, gene expression. PNG format xpr_per_cell_plot_pdf: @@ -534,7 +564,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_[!sgnl_]*.pdf" doc: | - Log normalized gene expression on cells UMAP. + UMAP, gene expression. PDF format xpr_per_cell_sgnl_plot_png: @@ -545,7 +575,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_sgnl_*.png" doc: | - Log normalized gene expression density on cells UMAP. + UMAP, gene expression density. PNG format xpr_per_cell_sgnl_plot_pdf: @@ -556,7 +586,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_sgnl_*.pdf" doc: | - Log normalized gene expression density on cells UMAP. + UMAP, gene expression density. PDF format xpr_dnst_res_plot_png: @@ -567,7 +597,7 @@ outputs: outputBinding: glob: "*_xpr_dnst_res_*.png" doc: | - Log normalized gene expression density per cluster. + Gene expression violin plot. PNG format xpr_dnst_res_plot_pdf: @@ -578,7 +608,7 @@ outputs: outputBinding: glob: "*_xpr_dnst_res_*.pdf" doc: | - Log normalized gene expression density per cluster. + Gene expression violin plot. PDF format xpr_htmp_res_plot_png: @@ -589,7 +619,7 @@ outputs: outputBinding: glob: "*_xpr_htmp_res_*.png" doc: | - Normalized gene expression heatmap grouped by cluster. + Gene expression heatmap. PNG format xpr_htmp_res_plot_pdf: @@ -600,7 +630,7 @@ outputs: outputBinding: glob: "*_xpr_htmp_res_*.pdf" doc: | - Normalized gene expression heatmap grouped by cluster. + Gene expression heatmap. PDF format gene_markers_tsv: @@ -608,7 +638,8 @@ outputs: outputBinding: glob: "*_gene_markers.tsv" doc: | - Differentially expressed genes between each pair of clusters for all resolutions. + Gene markers per cluster for + all resolutions. TSV format ucsc_cb_config_data: @@ -616,21 +647,24 @@ outputs: outputBinding: glob: "*_cellbrowser" doc: | - Directory with UCSC Cellbrowser configuration data. + Directory with UCSC Cellbrowser + configuration data. ucsc_cb_html_data: type: Directory? 
outputBinding: glob: "*_cellbrowser/html_data" doc: | - Directory with UCSC Cellbrowser html data. + Directory with UCSC Cellbrowser + html data. ucsc_cb_html_file: type: File? outputBinding: glob: "*_cellbrowser/html_data/index.html" doc: | - HTML index file from the directory with UCSC Cellbrowser html data. + HTML index file from the directory + with UCSC Cellbrowser html data. seurat_data_rds: type: File @@ -644,21 +678,24 @@ outputs: outputBinding: glob: "*_data.h5seurat" doc: | - Reduced Seurat data in h5seurat format + Reduced Seurat data in + h5seurat format seurat_data_h5ad: type: File? outputBinding: glob: "*_data.h5ad" doc: | - Reduced Seurat data in h5ad format + Reduced Seurat data in + h5ad format seurat_data_scope: type: File? outputBinding: glob: "*_data.loom" doc: | - Reduced Seurat data in SCope compatible loom format + Reduced Seurat data in + SCope compatible loom format stdout_log: type: stdout diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index 1a91eaa9..1d3ac8e8 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.24 + dockerPull: biowardrobe2/sc-tools:v0.0.25 inputs: diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index 8db7b4a7..d465e37b 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.24 + dockerPull: biowardrobe2/sc-tools:v0.0.25 inputs: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index 587514f2..f7c0cfe4 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.24 + dockerPull: biowardrobe2/sc-tools:v0.0.25 inputs: diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index 
bff3ac7c..9302fdda 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.24 + dockerPull: biowardrobe2/sc-tools:v0.0.25 inputs: diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index 31c3c407..4a1a3987 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.24 + dockerPull: biowardrobe2/sc-tools:v0.0.25 inputs: diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index 98c51ae1..4c656b41 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.24 + dockerPull: biowardrobe2/sc-tools:v0.0.25 inputs: diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index a00be673..00560ec2 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -33,102 +33,77 @@ inputs: alias: type: string - label: "Experiment short name/alias" + label: "Analysis name" sd:preview: position: 1 query_data_rds: type: File - label: "Experiment run through Single-cell RNA-Seq Dimensionality Reduction Analysis" + label: "Single-cell Analysis with PCA Transformed RNA-Seq Datasets" doc: | - Path to the RDS file to load Seurat object from. This file should include genes - expression information stored in the RNA assay, as well as 'pca' and 'rnaumap' - dimensionality reductions applied to that assay. + Analysis that includes single-cell + multiome ATAC and RNA-Seq or just + RNA-Seq datasets run through "Single-cell + RNA-Seq Dimensionality Reduction Analysis" + at any of the processing stages. 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" 'sd:localLabel': true dimensions: type: int? 
default: 40 - label: "Dimensionality to use when constructing nearest-neighbor graph before clustering (from 1 to 50)" + label: "Target dimensionality" doc: | - Dimensionality to use when constructing nearest- - neighbor graph before clustering (from 1 to 50). If - single value N is provided, use from 1 to N - dimensions. If multiple values are provided, subset to - only selected dimensions. - Default: from 1 to 10 - - cluster_algorithm: - type: - - "null" - - type: enum - symbols: - - "louvain" - - "mult-louvain" - - "slm" - - "leiden" - default: "louvain" - label: "Algorithm for modularity optimization when running clustering" - doc: | - Algorithm for modularity optimization when running clustering. - Default: louvain + Number of princinpal components to be + used in constructing nearest-neighbor + graph as part of the clustering + algorithm. Accepted values range from + 1 to 50. + Default: 40 resolution: type: float? default: 0.3 label: "Clustering resolution" doc: | - Clustering resolution applied to the constructed nearest-neighbor graph. - Can be set as an array but only the first item from the list will be used - for cluster labels and gene markers in the UCSC Cell Browser when running - with --cbbuild and --diffgenes parameters. - Default: 0.3, 0.5, 1.0 + Resolution to define the "granularity" + of the clustered data. Larger values + lead to a bigger number of clusters. + Optimal resolution often increases + with the number of cells. For a dataset + of 3K cells, the value within 0.4-1.2 + range usually returns good results. + Default: 0.3 + + identify_diff_genes: + type: boolean? + default: true + label: "Find gene markers" + doc: | + Identify upregulated genes in each + cluster compared to all other cells. + Include only genes that are expressed + in at least 10% of the cells coming + from either current cluster or from + all other clusters together. + Exclude cells with log2FoldChange + values less than 0.25. 
Use Wilcoxon + Rank Sum test to calculate P-values. + Keep only genes with P-values lower + than 0.01. Adjust P-values for multiple + comparisons using Bonferroni correction. + Default: true genes_of_interest: type: string? default: null - label: "Comma or space separated list of genes of interest" + label: "Genes of interest" doc: | - Genes of interest to build genes expression plots. + Comma or space separated list of + genes of interest to visualize + expression. Default: None - identify_diff_genes: - type: boolean? - default: false - label: "Identify differentially expressed genes between each pair of clusters" - doc: | - Identify differentially expressed genes (putative gene markers) between each - pair of clusters for all resolutions. - Default: false - 'sd:layout': - advanced: true - - minimum_logfc: - type: float? - default: 0.25 - label: "Include only those genes that on average have log fold change difference in expression between every tested pair of clusters not lower than this value" - doc: | - For putative gene markers identification include only those genes that - on average have log fold change difference in expression between every - tested pair of clusters not lower than this value. Ignored if '--diffgenes' - is not set. - Default: 0.25 - 'sd:layout': - advanced: true - - minimum_pct: - type: float? - default: 0.1 - label: "Include only those genes that are detected in not lower than this fraction of cells in either of the two tested clusters" - doc: | - For putative gene markers identification include only those genes that - are detected in not lower than this fraction of cells in either of the - two tested clusters. Ignored if '--diffgenes' is not set. - Default: 0.1 - 'sd:layout': - advanced: true - color_theme: type: - "null" @@ -143,41 +118,12 @@ inputs: - "classic" - "void" default: "classic" - label: "Color theme for all generated plots" + label: "Plots color theme" doc: | - Color theme for all generated plots. 
One of gray, bw, linedraw, light, - dark, minimal, classic, void. + Color theme for all plots saved + as PNG files. Default: classic - 'sd:layout': - advanced: true - - parallel_memory_limit: - type: - - "null" - - type: enum - symbols: - - "32" - default: "32" - label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" - doc: | - Maximum memory in GB allowed to be shared between the workers - when using multiple --cpus. - Forced to 32 GB - 'sd:layout': - advanced: true - - vector_memory_limit: - type: - - "null" - - type: enum - symbols: - - "64" - default: "64" - label: "Maximum vector memory in GB allowed to be used by R" - doc: | - Maximum vector memory in GB allowed to be used by R. - Forced to 64 GB - 'sd:layout': + "sd:layout": advanced: true threads: @@ -186,12 +132,15 @@ inputs: - type: enum symbols: - "1" + - "2" default: "1" - label: "Number of cores/cpus to use" + label: "Cores/CPUs number" doc: | - Number of cores/cpus to use - Forced to 1 - 'sd:layout': + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 1 + "sd:layout": advanced: true @@ -203,14 +152,13 @@ outputs: - type: array items: File outputSource: sc_rna_cluster/umap_res_plot_png - label: "Clustered cells UMAP" + label: "UMAP, colored by cluster" doc: | - Clustered cells UMAP. - PNG format + UMAP, colored by cluster 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Clustered cells UMAP' + tab: 'Per cluster' + Caption: 'UMAP, colored by cluster' slh_res_plot_png: type: @@ -218,14 +166,44 @@ outputs: - type: array items: File outputSource: sc_rna_cluster/slh_res_plot_png - label: "Silhouette scores. Downsampled to max 500 cells per cluster." 
+ label: "Silhouette scores" + doc: | + Silhouette scores + 'sd:visualPlugins': + - image: + tab: 'Per cluster' + Caption: 'Silhouette scores' + + umap_spl_ph_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/umap_spl_ph_res_plot_png + label: "UMAP, colored by cluster, split by cell cycle phase" + doc: | + UMAP, colored by cluster, + split by cell cycle phase + 'sd:visualPlugins': + - image: + tab: 'Per cluster' + Caption: 'UMAP, colored by cluster, split by cell cycle phase' + + cmp_gr_ph_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/cmp_gr_ph_spl_clst_res_plot_png + label: "Composition plot, colored by cell cycle phase, split by cluster, downsampled" doc: | - Silhouette scores. Downsampled to max 500 cells per cluster. - PNG format + Composition plot, colored by + cell cycle phase, split by + cluster, downsampled 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Silhouette scores. Downsampled to max 500 cells per cluster.' + tab: 'Per cluster' + Caption: 'Composition plot, colored by cell cycle phase, split by cluster, downsampled' umap_spl_idnt_res_plot_png: type: @@ -233,14 +211,14 @@ outputs: - type: array items: File outputSource: sc_rna_cluster/umap_spl_idnt_res_plot_png - label: "Split by dataset clustered cells UMAP" + label: "UMAP, colored by cluster, split by dataset" doc: | - Split by dataset clustered cells UMAP. - PNG format + UMAP, colored by cluster, + split by dataset 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by dataset clustered cells UMAP' + Caption: 'UMAP, colored by cluster, split by dataset' cmp_gr_clst_spl_idnt_res_plot_png: type: @@ -248,14 +226,15 @@ outputs: - type: array items: File outputSource: sc_rna_cluster/cmp_gr_clst_spl_idnt_res_plot_png - label: "Grouped by cluster split by dataset cells composition plot. Downsampled." 
+ label: "Composition plot, colored by cluster, split by dataset, downsampled" doc: | - Grouped by cluster split by dataset cells composition plot. Downsampled. - PNG format + Composition plot, colored by + cluster, split by dataset, + downsampled 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Grouped by cluster split by dataset cells composition plot. Downsampled.' + tab: 'Per dataset' + Caption: 'Composition plot, colored by cluster, split by dataset, downsampled' cmp_gr_idnt_spl_clst_res_plot_png: type: @@ -263,14 +242,28 @@ outputs: - type: array items: File outputSource: sc_rna_cluster/cmp_gr_idnt_spl_clst_res_plot_png - label: "Grouped by dataset split by cluster cells composition plot. Downsampled." + label: "Composition plot, colored by dataset, split by cluster, downsampled" + doc: | + Composition plot, colored by + dataset, split by cluster, + downsampled + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Composition plot, colored by dataset, split by cluster, downsampled' + + cmp_gr_ph_spl_idnt_plot_png: + type: File? + outputSource: sc_rna_cluster/cmp_gr_ph_spl_idnt_plot_png + label: "Composition plot, colored by cell cycle phase, split by dataset, downsampled" doc: | - Grouped by dataset split by cluster cells composition plot. Downsampled. - PNG format + Composition plot, colored by + cell cycle phase, split by + dataset, downsampled 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Grouped by dataset split by cluster cells composition plot. Downsampled.' + tab: 'Per dataset' + Caption: 'Composition plot, colored by cell cycle phase, split by dataset, downsampled' umap_spl_cnd_res_plot_png: type: @@ -278,14 +271,14 @@ outputs: - type: array items: File outputSource: sc_rna_cluster/umap_spl_cnd_res_plot_png - label: "Split by grouping condition clustered cells UMAP" + label: "UMAP, colored by cluster, split by grouping condition" doc: | - Split by grouping condition clustered cells UMAP. 
- PNG format + UMAP, colored by cluster, + split by grouping condition 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Split by grouping condition clustered cells UMAP' + Caption: 'UMAP, colored by cluster, split by grouping condition' cmp_gr_clst_spl_cnd_res_plot_png: type: @@ -293,14 +286,15 @@ outputs: - type: array items: File outputSource: sc_rna_cluster/cmp_gr_clst_spl_cnd_res_plot_png - label: "Grouped by cluster split by condition cells composition plot. Downsampled." + label: "Composition plot, colored by cluster, split by grouping condition, downsampled" doc: | - Grouped by cluster split by condition cells composition plot. Downsampled. - PNG format + Composition plot, colored by + cluster, split by grouping + condition, downsampled 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Grouped by cluster split by condition cells composition plot. Downsampled.' + Caption: 'Composition plot, colored by cluster, split by grouping condition, downsampled' cmp_gr_cnd_spl_clst_res_plot_png: type: @@ -308,71 +302,43 @@ outputs: - type: array items: File outputSource: sc_rna_cluster/cmp_gr_cnd_spl_clst_res_plot_png - label: "Grouped by condition split by cluster cells composition plot. Downsampled." + label: "Composition plot, colored by grouping condition, split by cluster, downsampled" doc: | - Grouped by condition split by cluster cells composition plot. Downsampled. - PNG format + Composition plot, colored by + grouping condition, split by + cluster, downsampled 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Grouped by condition split by cluster cells composition plot. Downsampled.' - - umap_spl_ph_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_rna_cluster/umap_spl_ph_res_plot_png - label: "Split by cell cycle phase clustered cells UMAP" - doc: | - Split by cell cycle phase clustered cells UMAP. 
- PNG format - 'sd:visualPlugins': - - image: - tab: 'Per dataset' - Caption: 'Split by cell cycle phase clustered cells UMAP' - - cmp_gr_ph_spl_idnt_plot_png: - type: File? - outputSource: sc_rna_cluster/cmp_gr_ph_spl_idnt_plot_png - label: "Grouped by cell cycle phase split by dataset cells composition plot. Downsampled." - doc: | - Grouped by cell cycle phase split by dataset cells composition plot. Downsampled. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Per dataset' - Caption: 'Grouped by cell cycle phase split by dataset cells composition plot. Downsampled.' + Caption: 'Composition plot, colored by grouping condition, split by cluster, downsampled' - cmp_gr_ph_spl_clst_res_plot_png: + xpr_avg_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_rna_cluster/cmp_gr_ph_spl_clst_res_plot_png - label: "Grouped by cell cycle phase split by cluster cells composition plot. Downsampled." + outputSource: sc_rna_cluster/xpr_avg_res_plot_png + label: "Gene expression dot plot" doc: | - Grouped by cell cycle phase split by cluster cells composition plot. Downsampled. - PNG format + Gene expression dot plot 'sd:visualPlugins': - image: - tab: 'Per dataset' - Caption: 'Grouped by cell cycle phase split by cluster cells composition plot. Downsampled.' + tab: 'Genes of interest' + Caption: 'Gene expression dot plot' - xpr_avg_res_plot_png: + xpr_dnst_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_rna_cluster/xpr_avg_res_plot_png - label: "Log normalized scaled average gene expression per cluster" + outputSource: sc_rna_cluster/xpr_dnst_res_plot_png + label: "Gene expression violin plot" doc: | - Log normalized scaled average gene expression per cluster. 
- PNG format + Gene expression violin plot 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized scaled average gene expression per cluster' + tab: 'Genes of interest' + Caption: 'Gene expression violin plot' xpr_per_cell_plot_png: type: @@ -380,14 +346,13 @@ outputs: - type: array items: File outputSource: sc_rna_cluster/xpr_per_cell_plot_png - label: "Log normalized gene expression on cells UMAP" + label: "UMAP, gene expression" doc: | - Log normalized gene expression on cells UMAP. - PNG format + UMAP, gene expression 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression on cells UMAP' + tab: 'Genes of interest' + Caption: 'UMAP, gene expression' xpr_per_cell_sgnl_plot_png: type: @@ -395,29 +360,13 @@ outputs: - type: array items: File outputSource: sc_rna_cluster/xpr_per_cell_sgnl_plot_png - label: "Log normalized gene expression density on cells UMAP" + label: "UMAP, gene expression density" doc: | - Log normalized gene expression density on cells UMAP. - PNG format + UMAP, gene expression density 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression density on cells UMAP' - - xpr_dnst_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_rna_cluster/xpr_dnst_res_plot_png - label: "Log normalized gene expression density per cluster" - doc: | - Log normalized gene expression density per cluster. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression density per cluster' + tab: 'Genes of interest' + Caption: 'UMAP, gene expression density' xpr_htmp_res_plot_png: type: @@ -425,43 +374,43 @@ outputs: - type: array items: File outputSource: sc_rna_cluster/xpr_htmp_res_plot_png - label: "Normalized gene expression heatmap grouped by cluster" + label: "Gene expression heatmap" doc: | - Normalized gene expression heatmap grouped by cluster. 
- PNG format + Gene expression heatmap 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Normalized gene expression heatmap grouped by cluster' + tab: 'Heatmap' + Caption: 'Gene expression heatmap' gene_markers_tsv: type: File? outputSource: sc_rna_cluster/gene_markers_tsv - label: "Differentially expressed genes between each pair of clusters" + label: "Gene markers per cluster for all resolutions" doc: | - Differentially expressed genes between each pair of clusters for all resolutions. - TSV format + Gene markers per cluster for + all resolutions 'sd:visualPlugins': - syncfusiongrid: tab: 'Gene markers' - Title: 'Differentially expressed genes between each pair of clusters' + Title: 'Gene markers per cluster for all resolutions' ucsc_cb_html_data: - type: Directory + type: Directory? outputSource: sc_rna_cluster/ucsc_cb_html_data - label: "Directory with UCSC Cellbrowser html data" + label: "UCSC Cell Browser data" doc: | - Directory with UCSC Cellbrowser html data. + Directory with UCSC Cell Browser + data ucsc_cb_html_file: - type: File + type: File? outputSource: sc_rna_cluster/ucsc_cb_html_file - label: "Open in UCSC Cell Browser" + label: "UCSC Cell Browser" doc: | - HTML index file from the directory with UCSC Cellbrowser html data. 
- 'sd:visualPlugins': + UCSC Cell Browser HTML index file + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" seurat_data_rds: @@ -504,18 +453,21 @@ steps: dimensions: dimensions cluster_metric: default: euclidean - cluster_algorithm: cluster_algorithm + cluster_algorithm: + default: "louvain" resolution: resolution genes_of_interest: source: genes_of_interest valueFrom: $(split_features(self)) identify_diff_genes: identify_diff_genes - minimum_logfc: minimum_logfc - minimum_pct: minimum_pct only_positive_diff_genes: default: true test_to_use: default: wilcox + minimum_logfc: + default: 0.25 + minimum_pct: + default: 0.1 verbose: default: true export_ucsc_cb: @@ -524,11 +476,9 @@ steps: default: true color_theme: color_theme parallel_memory_limit: - source: parallel_memory_limit - valueFrom: $(parseInt(self)) + default: 32 vector_memory_limit: - source: vector_memory_limit - valueFrom: $(parseInt(self)) + default: 96 threads: source: threads valueFrom: $(parseInt(self)) @@ -605,5 +555,6 @@ s:creator: doc: | Single-cell RNA-Seq Cluster Analysis - =============================================================== - Clusters single-cell RNA-Seq datasets, identifies gene markers. \ No newline at end of file + + Clusters single-cell RNA-Seq datasets, + identifies gene markers. \ No newline at end of file From 33a1dc841ed4ecd78091ade1060be61000885012 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 11 Jul 2023 17:15:11 -0400 Subject: [PATCH 046/162] Fix bug in sc-atac-reduce pipeline --- workflows/sc-atac-reduce.cwl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index 27da9073..0137ae22 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -428,7 +428,7 @@ outputs: ucsc_cb_html_data: type: Directory? 
- outputSource: sc_rna_reduce/ucsc_cb_html_data + outputSource: sc_atac_reduce/ucsc_cb_html_data label: "UCSC Cell Browser data" doc: | Directory with UCSC Cell Browser @@ -436,7 +436,7 @@ outputs: ucsc_cb_html_file: type: File? - outputSource: sc_rna_reduce/ucsc_cb_html_file + outputSource: sc_atac_reduce/ucsc_cb_html_file label: "UCSC Cell Browser" doc: | UCSC Cell Browser HTML index file From 71889664e7d5b993b61d405bb2365f2be1822f24 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 12 Jul 2023 11:52:22 -0400 Subject: [PATCH 047/162] Update sc-rna-reduce pipeline to include input for a custom cell cycle genes file --- workflows/sc-rna-reduce.cwl | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 36e11155..246d3be1 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -202,6 +202,17 @@ inputs: RNA-Seq Datasets" and can be utilized in the current or future steps of analysis. + custom_cell_cycle_data: + type: File? + label: "Custom cell cycle gene set (optional)" + doc: | + A TSV/CSV file with the gene list + for cell cycle score assignment. + The file should have two columns + named 'phase' and 'gene_id'. If + this input is provided, the "Cell + cycle gene set" will be ignored. + highly_var_genes_count: type: int? 
label: "Number of highly variable genes" @@ -543,12 +554,14 @@ steps: query_data_rds: query_data_rds barcodes_data: barcodes_data cell_cycle_data: - source: cell_cycle_data + source: [cell_cycle_data, custom_cell_cycle_data] valueFrom: | ${ - if (self.includes("human")) { + if (self[1] != null && self[1].class == "File"){ + return self[1]; + } else if (self[0].includes("human")) { return "hg38"; - } else if (self.includes("mouse")) { + } else if (self[0].includes("mouse")) { return "mm10"; } else { return null; From a33a19d323dc34420eacfb5ae16c2e6b4df0c779 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 12 Jul 2023 14:17:00 -0400 Subject: [PATCH 048/162] Fix typo --- workflows/sc-rna-cluster.cwl | 2 +- workflows/sc-rna-reduce.cwl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index 00560ec2..027c5f55 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -54,7 +54,7 @@ inputs: default: 40 label: "Target dimensionality" doc: | - Number of princinpal components to be + Number of principal components to be used in constructing nearest-neighbor graph as part of the clustering algorithm. Accepted values range from diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 246d3be1..f4091068 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -116,7 +116,7 @@ inputs: label: "Target dimensionality" default: 40 doc: | - Number of princinpal components to be used + Number of principal components to be used in PCA and UMAP projection. Accepted values range from 1 to 50. 
Default: 40 From 877695b8721cb3b47b3d6f3d163bb9b184688e08 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 12 Jul 2023 15:28:03 -0400 Subject: [PATCH 049/162] Set default normalization method to None in Cell Ranger Aggregare workflow --- workflows/cellranger-aggr.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cellranger-aggr.cwl b/workflows/cellranger-aggr.cwl index a3832a81..a4b6aba6 100644 --- a/workflows/cellranger-aggr.cwl +++ b/workflows/cellranger-aggr.cwl @@ -55,7 +55,7 @@ inputs: symbols: - "none" - "mapped" - default: "mapped" + default: "none" label: "Library depth normalization mode" doc: "Library depth normalization mode" 'sd:layout': From c14c0b6336d17a541c5bdfe30415a6f0f667f410 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 12 Jul 2023 21:13:30 -0400 Subject: [PATCH 050/162] Refactor sc-atac-reduce workflow --- tools/sc-atac-reduce.cwl | 175 +++++++++++-- workflows/sc-atac-reduce.cwl | 486 ++++++++++++++++++----------------- 2 files changed, 398 insertions(+), 263 deletions(-) diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index 23879948..7911474f 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -308,7 +308,7 @@ outputs: outputBinding: glob: "*_qc_dim_corr.png" doc: | - Correlation plots between QC metrics and cells LSI dimensions. + Correlation between QC metrics and LSI components. PNG format qc_dim_corr_plot_pdf: @@ -316,7 +316,7 @@ outputs: outputBinding: glob: "*_qc_dim_corr.pdf" doc: | - Correlation plots between QC metrics and cells LSI dimensions. + Correlation between QC metrics and LSI components. PDF format umap_qc_mtrcs_plot_png: @@ -324,7 +324,7 @@ outputs: outputBinding: glob: "*_umap_qc_mtrcs.png" doc: | - QC metrics on cells UMAP. + UMAP, QC metrics. PNG format umap_qc_mtrcs_plot_pdf: @@ -332,7 +332,7 @@ outputs: outputBinding: glob: "*_umap_qc_mtrcs.pdf" doc: | - QC metrics on cells UMAP. + UMAP, QC metrics. 
PDF format umap_plot_png: @@ -340,7 +340,7 @@ outputs: outputBinding: glob: "*_umap.png" doc: | - Cells UMAP. + UMAP, colored by dataset. PNG format umap_plot_pdf: @@ -348,7 +348,7 @@ outputs: outputBinding: glob: "*_umap.pdf" doc: | - Cells UMAP. + UMAP, colored by dataset. PDF format umap_spl_idnt_plot_png: @@ -356,7 +356,7 @@ outputs: outputBinding: glob: "*_umap_spl_idnt.png" doc: | - Split by dataset cells UMAP. + UMAP, split by dataset. PNG format umap_spl_idnt_plot_pdf: @@ -364,7 +364,7 @@ outputs: outputBinding: glob: "*_umap_spl_idnt.pdf" doc: | - Split by dataset cells UMAP. + UMAP, split by dataset. PDF format umap_spl_cnd_plot_png: @@ -372,7 +372,8 @@ outputs: outputBinding: glob: "*_umap_spl_cnd.png" doc: | - Split by grouping condition cells UMAP. + UMAP, colored by dataset, split + by grouping condition. PNG format umap_spl_cnd_plot_pdf: @@ -380,7 +381,8 @@ outputs: outputBinding: glob: "*_umap_spl_cnd.pdf" doc: | - Split by grouping condition cells UMAP. + UMAP, colored by dataset, split + by grouping condition. PDF format umap_spl_frgm_plot_png: @@ -388,7 +390,8 @@ outputs: outputBinding: glob: "*_umap_spl_frgm.png" doc: | - Split by the fragments in peaks per cell counts cells UMAP. + UMAP, colored by dataset, split + by fragments in peaks per cell. PNG format umap_spl_frgm_plot_pdf: @@ -396,7 +399,8 @@ outputs: outputBinding: glob: "*_umap_spl_frgm.pdf" doc: | - Split by the fragments in peaks per cell counts cells UMAP. + UMAP, colored by dataset, split + by fragments in peaks per cell. PDF format umap_spl_peak_plot_png: @@ -404,7 +408,8 @@ outputs: outputBinding: glob: "*_umap_spl_peak.png" doc: | - Split by the peaks per cell counts cells UMAP. + UMAP, colored by dataset, split + by peaks per cell. PNG format umap_spl_peak_plot_pdf: @@ -412,7 +417,8 @@ outputs: outputBinding: glob: "*_umap_spl_peak.pdf" doc: | - Split by the peaks per cell counts cells UMAP. + UMAP, colored by dataset, split + by peaks per cell. 
PDF format umap_spl_tss_plot_png: @@ -420,7 +426,8 @@ outputs: outputBinding: glob: "*_umap_spl_tss.png" doc: | - Split by the TSS enrichment score cells UMAP. + UMAP, colored by dataset, split + by TSS enrichment score. PNG format umap_spl_tss_plot_pdf: @@ -428,7 +435,8 @@ outputs: outputBinding: glob: "*_umap_spl_tss.pdf" doc: | - Split by the TSS enrichment score cells UMAP. + UMAP, colored by dataset, split + by TSS enrichment score. PDF format umap_spl_ncls_plot_png: @@ -436,7 +444,8 @@ outputs: outputBinding: glob: "*_umap_spl_ncls.png" doc: | - Split by the nucleosome signal cells UMAP. + UMAP, colored by dataset, split + by nucleosome signal. PNG format umap_spl_ncls_plot_pdf: @@ -444,7 +453,8 @@ outputs: outputBinding: glob: "*_umap_spl_ncls.pdf" doc: | - Split by the nucleosome signal cells UMAP. + UMAP, colored by dataset, split + by nucleosome signal. PDF format umap_spl_frip_plot_png: @@ -452,7 +462,8 @@ outputs: outputBinding: glob: "*_umap_spl_frip.png" doc: | - Split by the FRiP cells UMAP. + UMAP, colored by dataset, + split by FRiP. PNG format umap_spl_frip_plot_pdf: @@ -460,7 +471,8 @@ outputs: outputBinding: glob: "*_umap_spl_frip.pdf" doc: | - Split by the FRiP cells UMAP. + UMAP, colored by dataset, + split by FRiP. PDF format umap_spl_blck_plot_png: @@ -468,7 +480,8 @@ outputs: outputBinding: glob: "*_umap_spl_blck.png" doc: | - Split by the genomic blacklist regions fraction cells UMAP. + UMAP, colored by dataset, split + by blacklist fraction. PNG format umap_spl_blck_plot_pdf: @@ -476,7 +489,116 @@ outputs: outputBinding: glob: "*_umap_spl_blck.pdf" doc: | - Split by the genomic blacklist regions fraction cells UMAP. + UMAP, colored by dataset, split + by blacklist fraction. + PDF format + + umap_gr_cnd_spl_frgm_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_cnd_spl_frgm.png" + doc: | + UMAP, colored by grouping condition, + split by fragments in peaks per cell. + PNG format + + umap_gr_cnd_spl_frgm_plot_pdf: + type: File? 
+ outputBinding: + glob: "*_umap_gr_cnd_spl_frgm.pdf" + doc: | + UMAP, colored by grouping condition, + split by fragments in peaks per cell. + PDF format + + umap_gr_cnd_spl_peak_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_cnd_spl_peak.png" + doc: | + UMAP, colored by grouping condition, + split by peaks per cell. + PNG format + + umap_gr_cnd_spl_peak_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_gr_cnd_spl_peak.pdf" + doc: | + UMAP, colored by grouping condition, + split by peaks per cell. + PDF format + + umap_gr_cnd_spl_tss_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_cnd_spl_tss.png" + doc: | + UMAP, colored by grouping condition, + split by TSS enrichment score. + PNG format + + umap_gr_cnd_spl_tss_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_gr_cnd_spl_tss.pdf" + doc: | + UMAP, colored by grouping condition, + split by TSS enrichment score. + PDF format + + umap_gr_cnd_spl_ncls_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_cnd_spl_ncls.png" + doc: | + UMAP, colored by grouping condition, + split by nucleosome signal. + PNG format + + umap_gr_cnd_spl_ncls_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_gr_cnd_spl_ncls.pdf" + doc: | + UMAP, colored by grouping condition, + split by nucleosome signal. + PDF format + + umap_gr_cnd_spl_frip_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_cnd_spl_frip.png" + doc: | + UMAP, colored by grouping condition, + split by FRiP. + PNG format + + umap_gr_cnd_spl_frip_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_gr_cnd_spl_frip.pdf" + doc: | + UMAP, colored by grouping condition, + split by FRiP. + PDF format + + umap_gr_cnd_spl_blck_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_cnd_spl_blck.png" + doc: | + UMAP, colored by grouping condition, + split by blacklist fraction. + PNG format + + umap_gr_cnd_spl_blck_plot_pdf: + type: File? 
+ outputBinding: + glob: "*_umap_gr_cnd_spl_blck.pdf" + doc: | + UMAP, colored by grouping condition, + split by blacklist fraction. PDF format ucsc_cb_config_data: @@ -484,21 +606,24 @@ outputs: outputBinding: glob: "*_cellbrowser" doc: | - Directory with UCSC Cellbrowser configuration data. + Directory with UCSC Cellbrowser + configuration data. ucsc_cb_html_data: type: Directory? outputBinding: glob: "*_cellbrowser/html_data" doc: | - Directory with UCSC Cellbrowser html data. + Directory with UCSC Cellbrowser + html data. ucsc_cb_html_file: type: File? outputBinding: glob: "*_cellbrowser/html_data/index.html" doc: | - HTML index file from the directory with UCSC Cellbrowser html data. + HTML index file from the directory + with UCSC Cellbrowser html data. seurat_data_rds: type: File diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index 0137ae22..04e31fec 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -27,61 +27,21 @@ inputs: alias: type: string - label: "Experiment short name/alias" + label: "Analysis name" sd:preview: position: 1 query_data_rds: type: File - label: "Experiment run through Single-cell Multiome ATAC and RNA-Seq Filtering Analysis" + label: "Single-cell Analysis with Filtered ATAC-Seq Datasets" doc: | - Path to the RDS file to load Seurat object from. This file should include - chromatin accessibility information stored in the ATAC assay. + Any analysis that includes single-cell + multiome ATAC and RNA-Seq or just + ATAC-Seq datasets filtered by QC metrics + to include only high-quality cells. 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" 'sd:localLabel': true - datasets_metadata: - type: File? - label: "Path to the TSV/CSV file to optionally extend Seurat object metadata with categorical values" - doc: | - Path to the TSV/CSV file to optionally extend Seurat - object metadata with categorical values using samples - identities. 
First column - 'library_id' should - correspond to all unique values from the 'new.ident' - column of the loaded Seurat object. If any of the - provided in this file columns are already present in - the Seurat object metadata, they will be overwritten. - When combined with --barcodes parameter, first the - metadata will be extended, then barcode filtering will - be applied. - Default: no extra metadata is added - - barcodes_data: - type: File? - label: "Optional TSV/CSV file to prefilter and extend metadata be barcodes. First column should be named as 'barcode'" - doc: | - Path to the TSV/CSV file to optionally prefilter and - extend Seurat object metadata be selected barcodes. - First column should be named as 'barcode'. If file - includes any other columns they will be added to the - Seurat object metadata ovewriting the existing ones if - those are present. - Default: all cells used, no extra metadata is added - - dimensions: - type: int? - label: "Dimensionality to use for datasets integration and UMAP projection (from 2 to 50)" - default: 40 - doc: | - Dimensionality to use for datasets integration and - UMAP projection (from 2 to 50). If single value N is - provided, use from 2 to N LSI components. If multiple - values are provided, subset to only selected LSI - components. In combination with --ntgr set to harmony, - multiple values will result in using all dimensions - starting from 1(!) to the max of the provided values. - Default: from 2 to 10 - normalization_method: type: - "null" @@ -91,16 +51,18 @@ inputs: - "tf-logidf" - "logtf-logidf" - "idf" - label: "TF-IDF normalization method applied to chromatin accessibility counts" + label: "Normalization method" default: "log-tfidf" doc: | - TF-IDF normalization method applied to chromatin - accessibility counts. log-tfidf - Stuart & Butler et - al. 2019, tf-logidf - Cusanovich & Hill et al. 
2018, - logtf-logidf - Andrew Hill, idf - 10x Genomics, + TF-IDF normalization method to correct + for differences in cellular sequencing + depth. "log-tfidf" - Stuart & Butler + et al. 2019. "tf-logidf" - Cusanovich & + Hill et al. 2018. "logtf-logidf" - Andrew + Hill. "idf" - 10x Genomics. For more + details refer to + https://stuartlab.org/signac/reference/runtfidf Default: log-tfidf - 'sd:layout': - advanced: true integration_method: type: @@ -110,108 +72,104 @@ inputs: - "signac" - "harmony" - "none" - label: "Integration method used for joint analysis of multiple datasets" + label: "Integration method" default: "signac" doc: | - Integration method used for joint analysis of multiple - datasets. Automatically set to 'none' if loaded Suerat - object includes only one dataset. Default: signac - 'sd:layout': - advanced: true + Integration method to match shared cell + types and states across experimental + batches, donors, conditions, or datasets. + "signac" - use cross-dataset pairs of + cells that are in a matched biological + state ("anchors") to correct for technical + differences. "harmony" - use Harmony + algorithm described in Korsunsky, Millard, + and Fan, Nat Methods, 2019, to iteratively + correct LSI embeddings. "none" - do not + run integration, merge datasets instead. + Default: signac integrate_by: - type: string? - label: "Variable(s) to be integrated out when running multiple integration with Harmony" - default: "new.ident" - doc: | - Column(s) from the Seurat object metadata to define - the variable(s) that should be integrated out when - running multiple datasets integration with harmony. - May include columns from the extra metadata added with - --metadata parameter. Ignored if --ntgr is not set to - harmony. 
- Default: new.ident - 'sd:layout': - advanced: true + type: + - "null" + - string + - type: enum + symbols: + - "dataset" + - "condition" + label: "Batch correction (harmony)" + default: "dataset" + doc: | + When "harmony" is selected as "Integration + method", batch effects are corrected based + on the provided factors. Specifically, + "dataset" is used to integrate out the + influence of the cells' dataset of origin, + while the factor "condition" is used to + eliminate the influence of dataset grouping. + Default: dataset - minimum_var_peaks_perc: + dimensions: type: int? - label: "Minimum percentile for identifying the top most common peaks as highly variable" - default: 0 + label: "Target dimensionality" + default: 40 doc: | - Minimum percentile for identifying the top most common peaks as highly variable. - For example, setting to 5 will use the the top 95 percent most common among all cells - peaks as highly variable. These peaks are used for datasets integration, scaling - and dimensionality reduction. - Default: 0 (use all available peaks) - 'sd:layout': - advanced: true + Number of dimensions to be used in LSI, + datasets integration, and UMAP projection. + Accepted values range from 2 to 50. First + dimension is always excluded, unless + "Integration method" is set to "harmony". + Default: 40 - umap_spread: - type: float? - label: "UMAP Spread - the effective scale of embedded points (determines how clustered/clumped the embedded points are)" - default: 1 - doc: | - The effective scale of embedded points on UMAP. In combination with '--mindist' - it determines how clustered/clumped the embedded points are. - Default: 1 - 'sd:layout': - advanced: true + datasets_metadata: + type: File? + label: "Datasets metadata (optional)" + doc: | + If the selected single-cell analysis + includes multiple aggregated datasets, + each of them can be assigned to a + separate group by one or multiple + categories. 
This can be achieved by + providing a TSV/CSV file with + "library_id" as the first column and + any number of additional columns with + unique names, representing the desired + grouping categories. To obtain a proper + template of this file, download + "datasets_metadata.tsv" output from the + "Files" tab of the selected "Single-cell + Analysis with Filtered ATAC-Seq Datasets" + and add extra columns as needed. - umap_mindist: - type: float? - label: "UMAP Min. Dist. - controls how tightly the embedding is allowed compress points together" - default: 0.3 - doc: | - Controls how tightly the embedding is allowed compress points together on UMAP. - Larger values ensure embedded points are moreevenly distributed, while smaller - values allow the algorithm to optimise more accurately with regard to local structure. - Sensible values are in the range 0.001 to 0.5. - Default: 0.3 - 'sd:layout': - advanced: true + barcodes_data: + type: File? + label: "Selected cell barcodes (optional)" + doc: | + A TSV/CSV file to optionally prefilter + the single cell data by including only + the cells with the selected barcodes. + The provided file should include at + least one column named "barcode", with + one cell barcode per line. All other + columns, except for "barcode", will be + added to the single cell metadata loaded + from "Single-cell Analysis with Filtered + ATAC-Seq Datasets" and can be utilized in + the current or future steps of analysis. - umap_neighbors: + minimum_var_peaks_perc: type: int? - label: "UMAP Neighbors Number - determines the number of neighboring points used" - default: 30 + label: "Minimum percentile of highly variable peaks" + default: 0 doc: | - Determines the number of neighboring points used in UMAP. Larger values will result - in more global structure being preserved at the loss of detailed local structure. - In general this parameter should often be in the range 5 to 50. 
- Default: 30 - 'sd:layout': - advanced: true - - umap_metric: - type: - - "null" - - type: enum - symbols: - - "euclidean" - - "cosine" - - "correlation" - label: "UMAP Dist. Metric - the metric to use to compute distances in high dimensional space" - default: "cosine" - doc: | - The metric to use to compute distances in high dimensional space for UMAP. - Default: cosine - 'sd:layout': - advanced: true - - umap_method: - type: - - "null" - - type: enum - symbols: - - "uwot" - - "uwot-learn" - - "umap-learn" - label: "UMAP implementation to run (if set to 'umap-learn' use 'correlation' distance metric)" - default: "uwot" - doc: | - UMAP implementation to run. If set to 'umap-learn' use --umetric 'correlation' - Default: uwot + Minimum percentile for identifying + the top most common peaks as highly + variable. For example, setting to 5 + will use the the top 95 percent most + common among all cells peaks as highly + variable. Selected peaks are then being + used for datasets integration, scaling + and dimensionality reduction. + Default: 0 (use all available peaks) 'sd:layout': advanced: true @@ -239,41 +197,12 @@ inputs: - "classic" - "void" default: "classic" - label: "Color theme for all generated plots" + label: "Plots color theme" doc: | - Color theme for all generated plots. One of gray, bw, linedraw, light, - dark, minimal, classic, void. + Color theme for all plots saved + as PNG files. Default: classic - 'sd:layout': - advanced: true - - parallel_memory_limit: - type: - - "null" - - type: enum - symbols: - - "32" - default: "32" - label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" - doc: | - Maximum memory in GB allowed to be shared between the workers - when using multiple --cpus. 
- Forced to 32 GB - 'sd:layout': - advanced: true - - vector_memory_limit: - type: - - "null" - - type: enum - symbols: - - "96" - default: "96" - label: "Maximum vector memory in GB allowed to be used by R" - doc: | - Maximum vector memory in GB allowed to be used by R. - Forced to 96 GB - 'sd:layout': + "sd:layout": advanced: true threads: @@ -283,12 +212,14 @@ inputs: symbols: - "1" - "2" - default: "2" - label: "Number of cores/cpus to use" + default: "1" + label: "Cores/CPUs number" doc: | - Number of cores/cpus to use - Forced to 2 - 'sd:layout': + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 1 + "sd:layout": advanced: true @@ -297,134 +228,203 @@ outputs: qc_dim_corr_plot_png: type: File? outputSource: sc_atac_reduce/qc_dim_corr_plot_png - label: "Correlation plots between QC metrics and cells LSI dimensions" + label: "Correlation between QC metrics and LSI components" doc: | - Correlation plots between QC metrics and cells LSI dimensions. - PNG format + Correlation between QC metrics + and LSI components 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Correlation plots between QC metrics and cells LSI dimensions' + tab: 'QC' + Caption: 'Correlation between QC metrics and LSI components' umap_qc_mtrcs_plot_png: type: File? outputSource: sc_atac_reduce/umap_qc_mtrcs_plot_png - label: "QC metrics on cells UMAP" + label: "UMAP, QC metrics" doc: | - QC metrics on cells UMAP. - PNG format + UMAP, QC metrics 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'QC metrics on cells UMAP' + tab: 'QC' + Caption: 'UMAP, QC metrics' umap_plot_png: type: File? outputSource: sc_atac_reduce/umap_plot_png - label: "Cells UMAP" + label: "UMAP, colored by dataset" doc: | - Cells UMAP. - PNG format + UMAP, colored by dataset 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Cells UMAP' + tab: 'Per dataset' + Caption: 'UMAP, colored by dataset' umap_spl_idnt_plot_png: type: File? 
outputSource: sc_atac_reduce/umap_spl_idnt_plot_png - label: "Split by dataset cells UMAP" + label: "UMAP, split by dataset" doc: | - Split by dataset cells UMAP. - PNG format + UMAP, split by dataset 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by dataset cells UMAP' + Caption: 'UMAP, split by dataset' umap_spl_frgm_plot_png: type: File? outputSource: sc_atac_reduce/umap_spl_frgm_plot_png - label: "Split by the fragments in peaks per cell counts cells UMAP" + label: "UMAP, colored by dataset, split by fragments in peaks per cell" doc: | - Split by the fragments in peaks per cell counts cells UMAP. - PNG format + UMAP, colored by dataset, split + by fragments in peaks per cell. 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by the fragments in peaks per cell counts cells UMAP' + Caption: 'UMAP, colored by dataset, split by fragments in peaks per cell' umap_spl_peak_plot_png: type: File? outputSource: sc_atac_reduce/umap_spl_peak_plot_png - label: "Split by the peaks per cell counts cells UMAP" + label: "UMAP, colored by dataset, split by peaks per cell" doc: | - Split by the peaks per cell counts cells UMAP. - PNG format + UMAP, colored by dataset, split + by peaks per cell 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by the peaks per cell counts cells UMAP' + Caption: 'UMAP, colored by dataset, split by peaks per cell' umap_spl_tss_plot_png: type: File? outputSource: sc_atac_reduce/umap_spl_tss_plot_png - label: "Split by the TSS enrichment score cells UMAP" + label: "UMAP, colored by dataset, split by TSS enrichment score" doc: | - Split by the TSS enrichment score cells UMAP. - PNG format + UMAP, colored by dataset, split + by TSS enrichment score 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by the TSS enrichment score cells UMAP' + Caption: 'UMAP, colored by dataset, split by TSS enrichment score' umap_spl_ncls_plot_png: type: File? 
outputSource: sc_atac_reduce/umap_spl_ncls_plot_png - label: "Split by the nucleosome signal cells UMAP" + label: "UMAP, colored by dataset, split by nucleosome signal" doc: | - Split by the nucleosome signal cells UMAP. - PNG format + UMAP, colored by dataset, split + by nucleosome signal 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by the nucleosome signal cells UMAP' + Caption: 'UMAP, colored by dataset, split by nucleosome signal' umap_spl_frip_plot_png: type: File? outputSource: sc_atac_reduce/umap_spl_frip_plot_png - label: "Split by the FRiP cells UMAP" + label: "UMAP, colored by dataset, split by FRiP" doc: | - Split by the FRiP cells UMAP. - PNG format + UMAP, colored by dataset, + split by FRiP 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by the FRiP cells UMAP' + Caption: 'UMAP, colored by dataset, split by FRiP' umap_spl_blck_plot_png: type: File? outputSource: sc_atac_reduce/umap_spl_blck_plot_png - label: "Split by the genomic blacklist regions fraction cells UMAP" + label: "UMAP, colored by dataset, split by blacklist fraction" doc: | - Split by the genomic blacklist regions fraction cells UMAP. - PNG format + UMAP, colored by dataset, split + by blacklist fraction 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by the genomic blacklist regions fraction cells UMAP' + Caption: 'UMAP, colored by dataset, split by blacklist fraction' umap_spl_cnd_plot_png: type: File? outputSource: sc_atac_reduce/umap_spl_cnd_plot_png - label: "Split by grouping condition cells UMAP" + label: "UMAP, colored by dataset, split by grouping condition" doc: | - Split by grouping condition cells UMAP. - PNG format + UMAP, colored by dataset, split + by grouping condition 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Split by grouping condition cells UMAP' + Caption: 'UMAP, colored by dataset, split by grouping condition' + + umap_gr_cnd_spl_frgm_plot_png: + type: File? 
+ outputSource: sc_atac_reduce/umap_gr_cnd_spl_frgm_plot_png + label: "UMAP, colored by grouping condition, split by fragments in peaks per cell" + doc: | + UMAP, colored by grouping condition, + split by fragments in peaks per cell + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'UMAP, colored by grouping condition, split by fragments in peaks per cell' + + umap_gr_cnd_spl_peak_plot_png: + type: File? + outputSource: sc_atac_reduce/umap_gr_cnd_spl_peak_plot_png + label: "UMAP, colored by grouping condition, split by peaks per cell" + doc: | + UMAP, colored by grouping condition, + split by peaks per cell + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'UMAP, colored by grouping condition, split by peaks per cell' + + umap_gr_cnd_spl_tss_plot_png: + type: File? + outputSource: sc_atac_reduce/umap_gr_cnd_spl_tss_plot_png + label: "UMAP, colored by grouping condition, split by TSS enrichment score" + doc: | + UMAP, colored by grouping condition, + split by TSS enrichment score + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'UMAP, colored by grouping condition, split by TSS enrichment score' + + umap_gr_cnd_spl_ncls_plot_png: + type: File? + outputSource: sc_atac_reduce/umap_gr_cnd_spl_ncls_plot_png + label: "UMAP, colored by grouping condition, split by nucleosome signal" + doc: | + UMAP, colored by grouping condition, + split by nucleosome signal + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'UMAP, colored by grouping condition, split by nucleosome signal' + + umap_gr_cnd_spl_frip_plot_png: + type: File? + outputSource: sc_atac_reduce/umap_gr_cnd_spl_frip_plot_png + label: "UMAP, colored by grouping condition, split by FRiP" + doc: | + UMAP, colored by grouping condition, + split by FRiP + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'UMAP, colored by grouping condition, split by FRiP' + + umap_gr_cnd_spl_blck_plot_png: + type: File? 
+ outputSource: sc_atac_reduce/umap_gr_cnd_spl_blck_plot_png + label: "UMAP, colored by grouping condition, split by blacklist fraction" + doc: | + UMAP, colored by grouping condition, + split by blacklist fraction + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'UMAP, colored by grouping condition, split by blacklist fraction' ucsc_cb_html_data: type: Directory? @@ -482,24 +482,28 @@ steps: integration_method: integration_method integrate_by: source: integrate_by - valueFrom: $(split_features(self)) + valueFrom: | + ${ + if (self == "none") { + return null; + } else if (self == "dataset") { + return "new.ident"; + } else if (self == "condition") { + return "condition"; + } else { + return split_features(self); + } + } minimum_var_peaks_perc: minimum_var_peaks_perc dimensions: dimensions - umap_spread: umap_spread - umap_mindist: umap_mindist - umap_neighbors: umap_neighbors - umap_metric: umap_metric - umap_method: umap_method verbose: default: true export_ucsc_cb: export_ucsc_cb color_theme: color_theme parallel_memory_limit: - source: parallel_memory_limit - valueFrom: $(parseInt(self)) + default: 32 vector_memory_limit: - source: vector_memory_limit - valueFrom: $(parseInt(self)) + default: 96 threads: source: threads valueFrom: $(parseInt(self)) @@ -515,6 +519,12 @@ steps: - umap_spl_ncls_plot_png - umap_spl_frip_plot_png - umap_spl_blck_plot_png + - umap_gr_cnd_spl_frgm_plot_png + - umap_gr_cnd_spl_peak_plot_png + - umap_gr_cnd_spl_tss_plot_png + - umap_gr_cnd_spl_ncls_plot_png + - umap_gr_cnd_spl_frip_plot_png + - umap_gr_cnd_spl_blck_plot_png - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds From e9815fc6bf43bf264c80b457bce3e2d1623398d4 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 13 Jul 2023 12:05:58 -0400 Subject: [PATCH 051/162] Change the name of the workflow --- workflows/cellranger-arc-aggr.cwl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/cellranger-arc-aggr.cwl 
b/workflows/cellranger-arc-aggr.cwl index 2daaf442..d4f364f0 100644 --- a/workflows/cellranger-arc-aggr.cwl +++ b/workflows/cellranger-arc-aggr.cwl @@ -345,8 +345,8 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cell Ranger ARC Aggregate" -s:name: "Cell Ranger ARC Aggregate" +label: "Cell Ranger ARC Aggregate Gene Expression and Chromatin Accessibility" +s:name: "Cell Ranger ARC Aggregate Gene Expression and Chromatin Accessibility" s:alternateName: "Aggregates data from multiple Cell Ranger ARC Count Gene Expression and Chromatin Accessibility experiments" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/cellranger-arc-aggr.cwl @@ -385,7 +385,7 @@ s:creator: doc: | - Cell Ranger ARC Aggregate + Cell Ranger ARC Aggregate Gene Expression and Chromatin Accessibility Aggregates data from multiple Cell Ranger ARC Count Gene Expression and Chromatin Accessibility experiments. From 6c1cfce7ea5f06e244f0d6c68c4a75488b3f9735 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 13 Jul 2023 15:02:58 -0400 Subject: [PATCH 052/162] Refactore sc-atac-cluster workflow --- tools/sc-atac-cluster.cwl | 58 ++++-- workflows/sc-atac-cluster.cwl | 347 ++++++++++++++-------------------- workflows/sc-rna-cluster.cwl | 65 ++----- 3 files changed, 190 insertions(+), 280 deletions(-) diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index f0449b20..f3fa297b 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -258,7 +258,7 @@ outputs: outputBinding: glob: "*_umap_res_*.png" doc: | - Clustered cells UMAP. + UMAP, colored by cluster. PNG format umap_res_plot_pdf: @@ -269,7 +269,7 @@ outputs: outputBinding: glob: "*_umap_res_*.pdf" doc: | - Clustered cells UMAP. + UMAP, colored by cluster. PDF format slh_res_plot_png: @@ -280,7 +280,7 @@ outputs: outputBinding: glob: "*_slh_res_*.png" doc: | - Silhouette scores. 
Downsampled to max 500 cells per cluster. + Silhouette scores. PNG format slh_res_plot_pdf: @@ -291,7 +291,7 @@ outputs: outputBinding: glob: "*_slh_res_*.pdf" doc: | - Silhouette scores. Downsampled to max 500 cells per cluster. + Silhouette scores. PDF format umap_spl_idnt_res_plot_png: @@ -302,7 +302,8 @@ outputs: outputBinding: glob: "*_umap_spl_idnt_res_*.png" doc: | - Split by dataset clustered cells UMAP. + UMAP, colored by cluster, + split by dataset. PNG format umap_spl_idnt_res_plot_pdf: @@ -313,7 +314,8 @@ outputs: outputBinding: glob: "*_umap_spl_idnt_res_*.pdf" doc: | - Split by dataset clustered cells UMAP. + UMAP, colored by cluster, + split by dataset. PDF format cmp_gr_clst_spl_idnt_res_plot_png: @@ -324,7 +326,9 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_idnt_res_*.png" doc: | - Grouped by cluster split by dataset cells composition plot. Downsampled. + Composition plot, colored by + cluster, split by dataset, + downsampled. PNG format cmp_gr_clst_spl_idnt_res_plot_pdf: @@ -335,7 +339,9 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_idnt_res_*.pdf" doc: | - Grouped by cluster split by dataset cells composition plot. Downsampled. + Composition plot, colored by + cluster, split by dataset, + downsampled. PDF format cmp_gr_idnt_spl_clst_res_plot_png: @@ -346,7 +352,9 @@ outputs: outputBinding: glob: "*_cmp_gr_idnt_spl_clst_res_*.png" doc: | - Grouped by dataset split by cluster cells composition plot. Downsampled. + Composition plot, colored by + dataset, split by cluster, + downsampled. PNG format cmp_gr_idnt_spl_clst_res_plot_pdf: @@ -357,7 +365,9 @@ outputs: outputBinding: glob: "*_cmp_gr_idnt_spl_clst_res_*.pdf" doc: | - Grouped by dataset split by cluster cells composition plot. Downsampled. + Composition plot, colored by + dataset, split by cluster, + downsampled. 
PDF format umap_spl_cnd_res_plot_png: @@ -368,7 +378,8 @@ outputs: outputBinding: glob: "*_umap_spl_cnd_res_*.png" doc: | - Split by grouping condition clustered cells UMAP. + UMAP, colored by cluster, split + by grouping condition. PNG format umap_spl_cnd_res_plot_pdf: @@ -379,7 +390,8 @@ outputs: outputBinding: glob: "*_umap_spl_cnd_res_*.pdf" doc: | - Split by grouping condition clustered cells UMAP. + UMAP, colored by cluster, split + by grouping condition. PDF format cmp_gr_clst_spl_cnd_res_plot_png: @@ -390,7 +402,9 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_cnd_res_*.png" doc: | - Grouped by cluster split by condition cells composition plot. Downsampled. + Composition plot, colored by + cluster, split by grouping + condition, downsampled. PNG format cmp_gr_clst_spl_cnd_res_plot_pdf: @@ -401,7 +415,9 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_cnd_res_*.pdf" doc: | - Grouped by cluster split by condition cells composition plot. Downsampled. + Composition plot, colored by + cluster, split by grouping + condition, downsampled. PDF format cmp_gr_cnd_spl_clst_res_plot_png: @@ -412,7 +428,9 @@ outputs: outputBinding: glob: "*_cmp_gr_cnd_spl_clst_res_*.png" doc: | - Grouped by condition split by cluster cells composition plot. Downsampled. + Composition plot, colored by + grouping condition, split by + cluster, downsampled. PNG format cmp_gr_cnd_spl_clst_res_plot_pdf: @@ -423,7 +441,9 @@ outputs: outputBinding: glob: "*_cmp_gr_cnd_spl_clst_res_*.pdf" doc: | - Grouped by condition split by cluster cells composition plot. Downsampled. + Composition plot, colored by + grouping condition, split by + cluster, downsampled. PDF format cvrg_res_plot_png: @@ -434,7 +454,7 @@ outputs: outputBinding: glob: "*_cvrg_res_*.png" doc: | - Tn5 insertion frequency plot around gene. + Fragments coverage. PNG format cvrg_res_plot_pdf: @@ -445,7 +465,7 @@ outputs: outputBinding: glob: "*_cvrg_res_*.pdf" doc: | - Tn5 insertion frequency plot around gene. 
+ Fragments coverage. PDF format peak_markers_tsv: @@ -453,7 +473,7 @@ outputs: outputBinding: glob: "*_peak_markers.tsv" doc: | - Differentially accessible peaks between each pair of clusters for all resolutions. + Peak markers per cluster for all resolutions. TSV format ucsc_cb_config_data: diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index 1983f8e2..157cf853 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -36,111 +36,88 @@ inputs: alias: type: string - label: "Experiment short name/alias" + label: "Analysis name" sd:preview: position: 1 query_data_rds: type: File - label: "Experiment run through Single-cell ATAC-Seq Dimensionality Reduction Analysis" + label: "Single-cell Analysis with LSI Transformed ATAC-Seq Datasets" doc: | - Path to the RDS file to load Seurat object from. This file should include - chromatin accessibility information stored in the ATAC assay, as well as - 'atac_lsi' and 'atacumap' dimensionality reductions applied to that assay. + Analysis that includes single-cell + multiome ATAC and RNA-Seq or just + ATAC-Seq datasets run through "Single-cell + ATAC-Seq Dimensionality Reduction Analysis" + at any of the processing stages. 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" 'sd:localLabel': true + atac_fragments_file: + type: File? + secondaryFiles: + - .tbi + label: "Cell Ranger ARC Sample (optional)" + doc: | + "Cell Ranger ARC Sample" for generating + fragments coverage plots over the genes + of interest. + 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" + 'sd:localLabel': true + dimensions: type: int? default: 40 - label: "Dimensionality to use when constructing nearest-neighbor graph before clustering (from 1 to 50)" + label: "Target dimensionality" doc: | - Dimensionality to use when constructing nearest-neighbor graph before clustering - (from 1 to 50). If single value N is provided, use from 2 to N dimensions. 
If - multiple values are provided, subset to only selected dimensions. - Default: from 2 to 10 - - cluster_algorithm: - type: - - "null" - - type: enum - symbols: - - "louvain" - - "mult-louvain" - - "slm" - - "leiden" - default: "slm" - label: "Algorithm for modularity optimization when running clustering" - doc: | - Algorithm for modularity optimization when running clustering. - Default: slm + Number of LSI components to be used + in constructing nearest-neighbor graph + as part of the clustering algorithm. + Accepted values range from 2 to 50. + First dimension is always excluded + Default: 40 resolution: type: float? default: 0.3 label: "Clustering resolution" doc: | - Clustering resolution applied to the constructed nearest-neighbor graph. - Can be set as an array but only the first item from the list will be used - for cluster labels and peak markers in the UCSC Cell Browser when running - with --cbbuild and --diffpeaks parameters. - Default: 0.3, 0.5, 1.0 - - atac_fragments_file: - type: File? - secondaryFiles: - - .tbi - label: "Cell Ranger ARC Count/Aggregate Experiment" - doc: | - Count and barcode information for every ATAC fragment used in the loaded Seurat - object. File should be saved in TSV format with tbi-index file. - 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" - - genes_of_interest: - type: string? - default: null - label: "Genes of interest to build Tn5 insertion frequency plots for the nearest peaks" - doc: | - Genes of interest to build Tn5 insertion frequency plots for the nearest peaks. - If loaded Seurat object includes genes expression information in the RNA assay - it will be additionally shown on the right side of the plots. - Ignored if '--fragments' is not provided. - Default: None + Resolution to define the "granularity" + of the clustered data. Larger values + lead to a bigger number of clusters. + Optimal resolution often increases + with the number of cells. + Default: 0.3 identify_diff_peaks: type: boolean? 
 default: false - label: "Identify differentially accessible peaks between each pair of clusters" + label: "Find peak markers" doc: | - Identify differentially accessible peaks between each pair of clusters for all resolutions. + Identify differentially accessible + peaks in each cluster compared to + all other cells. Include only peaks + that are present in at least 5% of + the cells coming from either current + cluster or from all other clusters + together. Exclude peaks with + log2FoldChange values less than 0.25. + Use logistic regression framework to + calculate P-values. Keep only peaks + with P-values lower than 0.01. Adjust + P-values for multiple comparisons + using Bonferroni correction. Default: false - 'sd:layout': - advanced: true - - minimum_logfc: - type: float? - default: 0.25 - label: "Include only those peaks that on average have log fold change difference in the chromatin accessibility between every tested pair of clusters not lower than this value" - doc: | - For differentially accessible peaks identification include only those peaks that - on average have log fold change difference in the chromatin accessibility between - every tested pair of clusters not lower than this value. Ignored if '--diffpeaks' - is not set. - Default: 0.25 - 'sd:layout': - advanced: true - minimum_pct: - type: float? - default: 0.05 - label: "Include only those peaks that are detected in not lower than this fraction of cells in either of the two tested clusters" + genes_of_interest: + type: string? + default: null + label: "Genes of interest" doc: | - For differentially accessible peaks identification include only those peaks that - are detected in not lower than this fraction of cells in either of the two tested - clusters. Ignored if '--diffpeaks' is not set. - Default: 0.05 - 'sd:layout': - advanced: true + Comma or space separated list of genes + of interest to generate fragments coverage + plots. Ignored if "Cell Ranger ARC Sample" + input is not provided. 
+ Default: None color_theme: type: @@ -156,41 +133,12 @@ inputs: - "classic" - "void" default: "classic" - label: "Color theme for all generated plots" + label: "Plots color theme" doc: | - Color theme for all generated plots. One of gray, bw, linedraw, light, - dark, minimal, classic, void. + Color theme for all plots saved + as PNG files. Default: classic - 'sd:layout': - advanced: true - - parallel_memory_limit: - type: - - "null" - - type: enum - symbols: - - "32" - default: "32" - label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" - doc: | - Maximum memory in GB allowed to be shared between the workers - when using multiple --cpus. - Forced to 32 GB - 'sd:layout': - advanced: true - - vector_memory_limit: - type: - - "null" - - type: enum - symbols: - - "64" - default: "64" - label: "Maximum vector memory in GB allowed to be used by R" - doc: | - Maximum vector memory in GB allowed to be used by R. - Forced to 64 GB - 'sd:layout': + "sd:layout": advanced: true threads: @@ -199,136 +147,117 @@ inputs: - type: enum symbols: - "1" + - "2" default: "1" - label: "Number of cores/cpus to use" + label: "Cores/CPUs number" doc: | - Number of cores/cpus to use - Forced to 1 - 'sd:layout': + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 1 + "sd:layout": advanced: true outputs: umap_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: sc_atac_cluster/umap_res_plot_png - label: "Clustered cells UMAP" + label: "UMAP, colored by cluster" doc: | - Clustered cells UMAP. - PNG format + UMAP, colored by cluster 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Clustered cells UMAP' + tab: 'Per cluster' + Caption: 'UMAP, colored by cluster' slh_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: sc_atac_cluster/slh_res_plot_png - label: "Silhouette scores. 
Downsampled to max 500 cells per cluster." + label: "Silhouette scores" doc: | - Silhouette scores. Downsampled to max 500 cells per cluster. - PNG format + Silhouette scores 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Silhouette scores. Downsampled to max 500 cells per cluster.' + tab: 'Per cluster' + Caption: 'Silhouette scores' umap_spl_idnt_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: sc_atac_cluster/umap_spl_idnt_res_plot_png - label: "Split by dataset clustered cells UMAP" + label: "UMAP, colored by cluster, split by dataset" doc: | - Split by dataset clustered cells UMAP. - PNG format + UMAP, colored by cluster, + split by dataset 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by dataset clustered cells UMAP' + Caption: 'UMAP, colored by cluster, split by dataset' cmp_gr_clst_spl_idnt_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: sc_atac_cluster/cmp_gr_clst_spl_idnt_res_plot_png - label: "Grouped by cluster split by dataset cells composition plot. Downsampled." + label: "Composition plot, colored by cluster, split by dataset, downsampled" doc: | - Grouped by cluster split by dataset cells composition plot. Downsampled. - PNG format + Composition plot, colored by + cluster, split by dataset, + downsampled 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Grouped by cluster split by dataset cells composition plot. Downsampled.' + tab: 'Per dataset' + Caption: 'Composition plot, colored by cluster, split by dataset, downsampled' cmp_gr_idnt_spl_clst_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: sc_atac_cluster/cmp_gr_idnt_spl_clst_res_plot_png - label: "Grouped by dataset split by cluster cells composition plot. Downsampled." + label: "Composition plot, colored by dataset, split by cluster, downsampled" doc: | - Grouped by dataset split by cluster cells composition plot. Downsampled. 
- PNG format + Composition plot, colored by + dataset, split by cluster, + downsampled 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Grouped by dataset split by cluster cells composition plot. Downsampled.' + tab: 'Per dataset' + Caption: 'Composition plot, colored by dataset, split by cluster, downsampled' umap_spl_cnd_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: sc_atac_cluster/umap_spl_cnd_res_plot_png - label: "Split by grouping condition clustered cells UMAP" + label: "UMAP, colored by cluster, split by grouping condition" doc: | - Split by grouping condition clustered cells UMAP. - PNG format + UMAP, colored by cluster, split + by grouping condition 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Split by grouping condition clustered cells UMAP' + Caption: 'UMAP, colored by cluster, split by grouping condition' cmp_gr_clst_spl_cnd_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: sc_atac_cluster/cmp_gr_clst_spl_cnd_res_plot_png - label: "Grouped by cluster split by condition cells composition plot. Downsampled." + label: "Composition plot, colored by cluster, split by grouping condition, downsampled" doc: | - Grouped by cluster split by condition cells composition plot. Downsampled. - PNG format + Composition plot, colored by + cluster, split by grouping + condition, downsampled 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Grouped by cluster split by condition cells composition plot. Downsampled.' + Caption: 'Composition plot, colored by cluster, split by grouping condition, downsampled' cmp_gr_cnd_spl_clst_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: sc_atac_cluster/cmp_gr_cnd_spl_clst_res_plot_png - label: "Grouped by condition split by cluster cells composition plot. Downsampled." 
+ label: "Composition plot, colored by grouping condition, split by cluster, downsampled" doc: | - Grouped by condition split by cluster cells composition plot. Downsampled. - PNG format + Composition plot, colored by + grouping condition, split by + cluster, downsampled 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Grouped by condition split by cluster cells composition plot. Downsampled.' + Caption: 'Composition plot, colored by grouping condition, split by cluster, downsampled' cvrg_res_plot_png: type: @@ -336,43 +265,42 @@ outputs: - type: array items: File outputSource: sc_atac_cluster/cvrg_res_plot_png - label: "Tn5 insertion frequency plot around gene" + label: "Fragments coverage" doc: | - Tn5 insertion frequency plot around gene. - PNG format + Fragments coverage 'sd:visualPlugins': - image: tab: 'Genome coverage' - Caption: 'Tn5 insertion frequency plot around gene' + Caption: 'Fragments coverage' peak_markers_tsv: type: File? outputSource: sc_atac_cluster/peak_markers_tsv - label: "Differentially accessible peaks between each pair of clusters" + label: "Peak markers per cluster for all resolutions" doc: | - Differentially accessible peaks between each pair of clusters for all resolutions. - TSV format + Peak markers per cluster for all resolutions 'sd:visualPlugins': - syncfusiongrid: - tab: 'Diff. peaks' - Title: 'Differentially accessible peaks between each pair of clusters' + tab: 'Peak markers' + Title: 'Peak markers per cluster for all resolutions' ucsc_cb_html_data: - type: Directory + type: Directory? outputSource: sc_atac_cluster/ucsc_cb_html_data - label: "Directory with UCSC Cellbrowser html data" + label: "UCSC Cell Browser data" doc: | - Directory with UCSC Cellbrowser html data. + Directory with UCSC Cell Browser + data ucsc_cb_html_file: - type: File + type: File? 
outputSource: sc_atac_cluster/ucsc_cb_html_file - label: "Open in UCSC Cell Browser" + label: "UCSC Cell Browser" doc: | - HTML index file from the directory with UCSC Cellbrowser html data. - 'sd:visualPlugins': + UCSC Cell Browser HTML index file + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" seurat_data_rds: @@ -409,15 +337,18 @@ steps: dimensions: dimensions cluster_metric: default: euclidean - cluster_algorithm: cluster_algorithm + cluster_algorithm: + default: "slm" resolution: resolution atac_fragments_file: atac_fragments_file genes_of_interest: source: genes_of_interest valueFrom: $(split_features(self)) identify_diff_peaks: identify_diff_peaks - minimum_logfc: minimum_logfc - minimum_pct: minimum_pct + minimum_logfc: + default: 0.25 + minimum_pct: + default: 0.05 test_to_use: default: LR verbose: @@ -426,11 +357,9 @@ steps: default: true color_theme: color_theme parallel_memory_limit: - source: parallel_memory_limit - valueFrom: $(parseInt(self)) + default: 32 vector_memory_limit: - source: vector_memory_limit - valueFrom: $(parseInt(self)) + default: 96 threads: source: threads valueFrom: $(parseInt(self)) @@ -500,5 +429,5 @@ s:creator: doc: | Single-cell ATAC-Seq Cluster Analysis - Clusters single-cell ATAC-Seq datasets, identifies differentially - accessible peaks. \ No newline at end of file + Clusters single-cell ATAC-Seq datasets, identifies + differentially accessible peaks. \ No newline at end of file diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index 027c5f55..52fee49c 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -147,10 +147,7 @@ inputs: outputs: umap_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? 
outputSource: sc_rna_cluster/umap_res_plot_png label: "UMAP, colored by cluster" doc: | @@ -161,10 +158,7 @@ outputs: Caption: 'UMAP, colored by cluster' slh_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: sc_rna_cluster/slh_res_plot_png label: "Silhouette scores" doc: | @@ -175,10 +169,7 @@ outputs: Caption: 'Silhouette scores' umap_spl_ph_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: sc_rna_cluster/umap_spl_ph_res_plot_png label: "UMAP, colored by cluster, split by cell cycle phase" doc: | @@ -190,10 +181,7 @@ outputs: Caption: 'UMAP, colored by cluster, split by cell cycle phase' cmp_gr_ph_spl_clst_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: sc_rna_cluster/cmp_gr_ph_spl_clst_res_plot_png label: "Composition plot, colored by cell cycle phase, split by cluster, downsampled" doc: | @@ -206,10 +194,7 @@ outputs: Caption: 'Composition plot, colored by cell cycle phase, split by cluster, downsampled' umap_spl_idnt_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: sc_rna_cluster/umap_spl_idnt_res_plot_png label: "UMAP, colored by cluster, split by dataset" doc: | @@ -221,10 +206,7 @@ outputs: Caption: 'UMAP, colored by cluster, split by dataset' cmp_gr_clst_spl_idnt_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: sc_rna_cluster/cmp_gr_clst_spl_idnt_res_plot_png label: "Composition plot, colored by cluster, split by dataset, downsampled" doc: | @@ -237,10 +219,7 @@ outputs: Caption: 'Composition plot, colored by cluster, split by dataset, downsampled' cmp_gr_idnt_spl_clst_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? 
outputSource: sc_rna_cluster/cmp_gr_idnt_spl_clst_res_plot_png label: "Composition plot, colored by dataset, split by cluster, downsampled" doc: | @@ -266,10 +245,7 @@ outputs: Caption: 'Composition plot, colored by cell cycle phase, split by dataset, downsampled' umap_spl_cnd_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: sc_rna_cluster/umap_spl_cnd_res_plot_png label: "UMAP, colored by cluster, split by grouping condition" doc: | @@ -281,10 +257,7 @@ outputs: Caption: 'UMAP, colored by cluster, split by grouping condition' cmp_gr_clst_spl_cnd_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: sc_rna_cluster/cmp_gr_clst_spl_cnd_res_plot_png label: "Composition plot, colored by cluster, split by grouping condition, downsampled" doc: | @@ -297,10 +270,7 @@ outputs: Caption: 'Composition plot, colored by cluster, split by grouping condition, downsampled' cmp_gr_cnd_spl_clst_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: sc_rna_cluster/cmp_gr_cnd_spl_clst_res_plot_png label: "Composition plot, colored by grouping condition, split by cluster, downsampled" doc: | @@ -313,10 +283,7 @@ outputs: Caption: 'Composition plot, colored by grouping condition, split by cluster, downsampled' xpr_avg_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: sc_rna_cluster/xpr_avg_res_plot_png label: "Gene expression dot plot" doc: | @@ -327,10 +294,7 @@ outputs: Caption: 'Gene expression dot plot' xpr_dnst_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: sc_rna_cluster/xpr_dnst_res_plot_png label: "Gene expression violin plot" doc: | @@ -369,10 +333,7 @@ outputs: Caption: 'UMAP, gene expression density' xpr_htmp_res_plot_png: - type: - - "null" - - type: array - items: File + type: File? 
outputSource: sc_rna_cluster/xpr_htmp_res_plot_png label: "Gene expression heatmap" doc: | From 483b229445fc4c95488f1d9af29237b3ca82bbb2 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 13 Jul 2023 15:52:11 -0400 Subject: [PATCH 053/162] Fix bug with array type output --- workflows/sc-atac-cluster.cwl | 40 ++++++++++++++++----- workflows/sc-rna-cluster.cwl | 65 ++++++++++++++++++++++++++++------- 2 files changed, 84 insertions(+), 21 deletions(-) diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index 157cf853..d6d04b83 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -162,7 +162,10 @@ inputs: outputs: umap_res_plot_png: - type: File? + type: + - "null" + - type: array + items: File outputSource: sc_atac_cluster/umap_res_plot_png label: "UMAP, colored by cluster" doc: | @@ -173,7 +176,10 @@ outputs: Caption: 'UMAP, colored by cluster' slh_res_plot_png: - type: File? + type: + - "null" + - type: array + items: File outputSource: sc_atac_cluster/slh_res_plot_png label: "Silhouette scores" doc: | @@ -184,7 +190,10 @@ outputs: Caption: 'Silhouette scores' umap_spl_idnt_res_plot_png: - type: File? + type: + - "null" + - type: array + items: File outputSource: sc_atac_cluster/umap_spl_idnt_res_plot_png label: "UMAP, colored by cluster, split by dataset" doc: | @@ -196,7 +205,10 @@ outputs: Caption: 'UMAP, colored by cluster, split by dataset' cmp_gr_clst_spl_idnt_res_plot_png: - type: File? + type: + - "null" + - type: array + items: File outputSource: sc_atac_cluster/cmp_gr_clst_spl_idnt_res_plot_png label: "Composition plot, colored by cluster, split by dataset, downsampled" doc: | @@ -209,7 +221,10 @@ outputs: Caption: 'Composition plot, colored by cluster, split by dataset, downsampled' cmp_gr_idnt_spl_clst_res_plot_png: - type: File? 
+ type: + - "null" + - type: array + items: File outputSource: sc_atac_cluster/cmp_gr_idnt_spl_clst_res_plot_png label: "Composition plot, colored by dataset, split by cluster, downsampled" doc: | @@ -222,7 +237,10 @@ outputs: Caption: 'Composition plot, colored by dataset, split by cluster, downsampled' umap_spl_cnd_res_plot_png: - type: File? + type: + - "null" + - type: array + items: File outputSource: sc_atac_cluster/umap_spl_cnd_res_plot_png label: "UMAP, colored by cluster, split by grouping condition" doc: | @@ -234,7 +252,10 @@ outputs: Caption: 'UMAP, colored by cluster, split by grouping condition' cmp_gr_clst_spl_cnd_res_plot_png: - type: File? + type: + - "null" + - type: array + items: File outputSource: sc_atac_cluster/cmp_gr_clst_spl_cnd_res_plot_png label: "Composition plot, colored by cluster, split by grouping condition, downsampled" doc: | @@ -247,7 +268,10 @@ outputs: Caption: 'Composition plot, colored by cluster, split by grouping condition, downsampled' cmp_gr_cnd_spl_clst_res_plot_png: - type: File? + type: + - "null" + - type: array + items: File outputSource: sc_atac_cluster/cmp_gr_cnd_spl_clst_res_plot_png label: "Composition plot, colored by grouping condition, split by cluster, downsampled" doc: | diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index 52fee49c..027c5f55 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -147,7 +147,10 @@ inputs: outputs: umap_res_plot_png: - type: File? + type: + - "null" + - type: array + items: File outputSource: sc_rna_cluster/umap_res_plot_png label: "UMAP, colored by cluster" doc: | @@ -158,7 +161,10 @@ outputs: Caption: 'UMAP, colored by cluster' slh_res_plot_png: - type: File? + type: + - "null" + - type: array + items: File outputSource: sc_rna_cluster/slh_res_plot_png label: "Silhouette scores" doc: | @@ -169,7 +175,10 @@ outputs: Caption: 'Silhouette scores' umap_spl_ph_res_plot_png: - type: File? 
+ type: + - "null" + - type: array + items: File outputSource: sc_rna_cluster/umap_spl_ph_res_plot_png label: "UMAP, colored by cluster, split by cell cycle phase" doc: | @@ -181,7 +190,10 @@ outputs: Caption: 'UMAP, colored by cluster, split by cell cycle phase' cmp_gr_ph_spl_clst_res_plot_png: - type: File? + type: + - "null" + - type: array + items: File outputSource: sc_rna_cluster/cmp_gr_ph_spl_clst_res_plot_png label: "Composition plot, colored by cell cycle phase, split by cluster, downsampled" doc: | @@ -194,7 +206,10 @@ outputs: Caption: 'Composition plot, colored by cell cycle phase, split by cluster, downsampled' umap_spl_idnt_res_plot_png: - type: File? + type: + - "null" + - type: array + items: File outputSource: sc_rna_cluster/umap_spl_idnt_res_plot_png label: "UMAP, colored by cluster, split by dataset" doc: | @@ -206,7 +221,10 @@ outputs: Caption: 'UMAP, colored by cluster, split by dataset' cmp_gr_clst_spl_idnt_res_plot_png: - type: File? + type: + - "null" + - type: array + items: File outputSource: sc_rna_cluster/cmp_gr_clst_spl_idnt_res_plot_png label: "Composition plot, colored by cluster, split by dataset, downsampled" doc: | @@ -219,7 +237,10 @@ outputs: Caption: 'Composition plot, colored by cluster, split by dataset, downsampled' cmp_gr_idnt_spl_clst_res_plot_png: - type: File? + type: + - "null" + - type: array + items: File outputSource: sc_rna_cluster/cmp_gr_idnt_spl_clst_res_plot_png label: "Composition plot, colored by dataset, split by cluster, downsampled" doc: | @@ -245,7 +266,10 @@ outputs: Caption: 'Composition plot, colored by cell cycle phase, split by dataset, downsampled' umap_spl_cnd_res_plot_png: - type: File? + type: + - "null" + - type: array + items: File outputSource: sc_rna_cluster/umap_spl_cnd_res_plot_png label: "UMAP, colored by cluster, split by grouping condition" doc: | @@ -257,7 +281,10 @@ outputs: Caption: 'UMAP, colored by cluster, split by grouping condition' cmp_gr_clst_spl_cnd_res_plot_png: - type: File? 
+ type: + - "null" + - type: array + items: File outputSource: sc_rna_cluster/cmp_gr_clst_spl_cnd_res_plot_png label: "Composition plot, colored by cluster, split by grouping condition, downsampled" doc: | @@ -270,7 +297,10 @@ outputs: Caption: 'Composition plot, colored by cluster, split by grouping condition, downsampled' cmp_gr_cnd_spl_clst_res_plot_png: - type: File? + type: + - "null" + - type: array + items: File outputSource: sc_rna_cluster/cmp_gr_cnd_spl_clst_res_plot_png label: "Composition plot, colored by grouping condition, split by cluster, downsampled" doc: | @@ -283,7 +313,10 @@ outputs: Caption: 'Composition plot, colored by grouping condition, split by cluster, downsampled' xpr_avg_res_plot_png: - type: File? + type: + - "null" + - type: array + items: File outputSource: sc_rna_cluster/xpr_avg_res_plot_png label: "Gene expression dot plot" doc: | @@ -294,7 +327,10 @@ outputs: Caption: 'Gene expression dot plot' xpr_dnst_res_plot_png: - type: File? + type: + - "null" + - type: array + items: File outputSource: sc_rna_cluster/xpr_dnst_res_plot_png label: "Gene expression violin plot" doc: | @@ -333,7 +369,10 @@ outputs: Caption: 'UMAP, gene expression density' xpr_htmp_res_plot_png: - type: File? 
+ type: + - "null" + - type: array + items: File outputSource: sc_rna_cluster/xpr_htmp_res_plot_png label: "Gene expression heatmap" doc: | From 0c440356c01679a8b3e1ca8956019e62ffeadedd Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 13 Jul 2023 17:34:09 -0400 Subject: [PATCH 054/162] Refactor sc-wnn-cluster workflow --- tools/sc-wnn-cluster.cwl | 100 ++++-- workflows/sc-atac-cluster.cwl | 9 +- workflows/sc-rna-cluster.cwl | 9 +- workflows/sc-wnn-cluster.cwl | 600 +++++++++++++--------------------- 4 files changed, 309 insertions(+), 409 deletions(-) diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index 4c656b41..bcee379e 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -405,7 +405,7 @@ outputs: outputBinding: glob: "*_umap_res_*.png" doc: | - Clustered cells UMAP. + UMAP, colored by cluster. PNG format umap_res_plot_pdf: @@ -416,7 +416,7 @@ outputs: outputBinding: glob: "*_umap_res_*.pdf" doc: | - Clustered cells UMAP. + UMAP, colored by cluster. PDF format umap_spl_idnt_res_plot_png: @@ -427,7 +427,8 @@ outputs: outputBinding: glob: "*_umap_spl_idnt_res_*.png" doc: | - Split by dataset clustered cells UMAP. + UMAP, colored by cluster, + split by dataset. PNG format umap_spl_idnt_res_plot_pdf: @@ -438,7 +439,8 @@ outputs: outputBinding: glob: "*_umap_spl_idnt_res_*.pdf" doc: | - Split by dataset clustered cells UMAP. + UMAP, colored by cluster, + split by dataset. PDF format cmp_gr_clst_spl_idnt_res_plot_png: @@ -449,7 +451,9 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_idnt_res_*.png" doc: | - Grouped by cluster split by dataset cells composition plot. Downsampled. + Composition plot, colored by + cluster, split by dataset, + downsampled. PNG format cmp_gr_clst_spl_idnt_res_plot_pdf: @@ -460,7 +464,9 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_idnt_res_*.pdf" doc: | - Grouped by cluster split by dataset cells composition plot. Downsampled. 
+ Composition plot, colored by + cluster, split by dataset, + downsampled. PDF format cmp_gr_idnt_spl_clst_res_plot_png: @@ -471,7 +477,9 @@ outputs: outputBinding: glob: "*_cmp_gr_idnt_spl_clst_res_*.png" doc: | - Grouped by dataset split by cluster cells composition plot. Downsampled. + Composition plot, colored by + dataset, split by cluster, + downsampled. PNG format cmp_gr_idnt_spl_clst_res_plot_pdf: @@ -482,7 +490,9 @@ outputs: outputBinding: glob: "*_cmp_gr_idnt_spl_clst_res_*.pdf" doc: | - Grouped by dataset split by cluster cells composition plot. Downsampled. + Composition plot, colored by + dataset, split by cluster, + downsampled. PDF format umap_spl_cnd_res_plot_png: @@ -493,7 +503,8 @@ outputs: outputBinding: glob: "*_umap_spl_cnd_res_*.png" doc: | - Split by grouping condition clustered cells UMAP. + UMAP, colored by cluster, + split by grouping condition. PNG format umap_spl_cnd_res_plot_pdf: @@ -504,7 +515,8 @@ outputs: outputBinding: glob: "*_umap_spl_cnd_res_*.pdf" doc: | - Split by grouping condition clustered cells UMAP. + UMAP, colored by cluster, + split by grouping condition. PDF format cmp_gr_clst_spl_cnd_res_plot_png: @@ -515,7 +527,9 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_cnd_res_*.png" doc: | - Grouped by cluster split by condition cells composition plot. Downsampled. + Composition plot, colored by + cluster, split by grouping + condition, downsampled. PNG format cmp_gr_clst_spl_cnd_res_plot_pdf: @@ -526,7 +540,9 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_cnd_res_*.pdf" doc: | - Grouped by cluster split by condition cells composition plot. Downsampled. + Composition plot, colored by + cluster, split by grouping + condition, downsampled. PDF format cmp_gr_cnd_spl_clst_res_plot_png: @@ -537,7 +553,9 @@ outputs: outputBinding: glob: "*_cmp_gr_cnd_spl_clst_res_*.png" doc: | - Grouped by condition split by cluster cells composition plot. Downsampled. 
+ Composition plot, colored by + grouping condition, split by + cluster, downsampled. PNG format cmp_gr_cnd_spl_clst_res_plot_pdf: @@ -548,7 +566,9 @@ outputs: outputBinding: glob: "*_cmp_gr_cnd_spl_clst_res_*.pdf" doc: | - Grouped by condition split by cluster cells composition plot. Downsampled. + Composition plot, colored by + grouping condition, split by + cluster, downsampled. PDF format umap_spl_ph_res_plot_png: @@ -559,7 +579,8 @@ outputs: outputBinding: glob: "*_umap_spl_ph_res_*.png" doc: | - Split by cell cycle phase clustered cells UMAP. + UMAP, colored by cluster, + split by cell cycle phase. PNG format umap_spl_ph_res_plot_pdf: @@ -570,7 +591,8 @@ outputs: outputBinding: glob: "*_umap_spl_ph_res_*.pdf" doc: | - Split by cell cycle phase clustered cells UMAP. + UMAP, colored by cluster, + split by cell cycle phase. PDF format cmp_gr_ph_spl_idnt_plot_png: @@ -578,7 +600,9 @@ outputs: outputBinding: glob: "*_cmp_gr_ph_spl_idnt.png" doc: | - Grouped by cell cycle phase split by dataset cells composition plot. Downsampled. + Composition plot, colored by + cell cycle phase, split by + dataset, downsampled. PNG format cmp_gr_ph_spl_idnt_plot_pdf: @@ -586,7 +610,9 @@ outputs: outputBinding: glob: "*_cmp_gr_ph_spl_idnt.pdf" doc: | - Grouped by cell cycle phase split by dataset cells composition plot. Downsampled. + Composition plot, colored by + cell cycle phase, split by + dataset, downsampled. PDF format cmp_gr_ph_spl_clst_res_plot_png: @@ -597,7 +623,9 @@ outputs: outputBinding: glob: "*_cmp_gr_ph_spl_clst_res_*.png" doc: | - Grouped by cell cycle phase split by cluster cells composition plot. Downsampled. + Composition plot, colored by + cell cycle phase, split by + cluster, downsampled. PNG format cmp_gr_ph_spl_clst_res_plot_pdf: @@ -608,7 +636,9 @@ outputs: outputBinding: glob: "*_cmp_gr_ph_spl_clst_res_*.pdf" doc: | - Grouped by cell cycle phase split by cluster cells composition plot. Downsampled. 
+ Composition plot, colored by + cell cycle phase, split by + cluster, downsampled. PDF format xpr_avg_res_plot_png: @@ -619,7 +649,7 @@ outputs: outputBinding: glob: "*_xpr_avg_res_*.png" doc: | - Log normalized scaled average gene expression per cluster. + Gene expression dot plot. PNG format xpr_avg_res_plot_pdf: @@ -630,7 +660,7 @@ outputs: outputBinding: glob: "*_xpr_avg_res_*.pdf" doc: | - Log normalized scaled average gene expression per cluster. + Gene expression dot plot. PDF format xpr_per_cell_plot_png: @@ -641,7 +671,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_[!sgnl_]*.png" doc: | - Log normalized gene expression on cells UMAP. + UMAP, gene expression. PNG format xpr_per_cell_plot_pdf: @@ -652,7 +682,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_[!sgnl_]*.pdf" doc: | - Log normalized gene expression on cells UMAP. + UMAP, gene expression. PDF format xpr_per_cell_sgnl_plot_png: @@ -663,7 +693,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_sgnl_*.png" doc: | - Log normalized gene expression density on cells UMAP. + UMAP, gene expression density. PNG format xpr_per_cell_sgnl_plot_pdf: @@ -674,7 +704,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_sgnl_*.pdf" doc: | - Log normalized gene expression density on cells UMAP. + UMAP, gene expression density. PDF format xpr_dnst_res_plot_png: @@ -685,7 +715,7 @@ outputs: outputBinding: glob: "*_xpr_dnst_res_*.png" doc: | - Log normalized gene expression density per cluster. + Gene expression violin plot. PNG format xpr_dnst_res_plot_pdf: @@ -696,7 +726,7 @@ outputs: outputBinding: glob: "*_xpr_dnst_res_*.pdf" doc: | - Log normalized gene expression density per cluster. + Gene expression violin plot. PDF format cvrg_res_plot_png: @@ -707,7 +737,7 @@ outputs: outputBinding: glob: "*_cvrg_res_*.png" doc: | - Tn5 insertion frequency plot around gene. + Fragments coverage. 
PNG format cvrg_res_plot_pdf: @@ -718,7 +748,7 @@ outputs: outputBinding: glob: "*_cvrg_res_*.pdf" doc: | - Tn5 insertion frequency plot around gene. + Fragments coverage. PDF format xpr_htmp_res_plot_png: @@ -729,7 +759,7 @@ outputs: outputBinding: glob: "*_xpr_htmp_res_*.png" doc: | - Normalized gene expression heatmap grouped by cluster. + Gene expression heatmap. PNG format xpr_htmp_res_plot_pdf: @@ -740,7 +770,7 @@ outputs: outputBinding: glob: "*_xpr_htmp_res_*.pdf" doc: | - Normalized gene expression heatmap grouped by cluster. + Gene expression heatmap. PDF format gene_markers_tsv: @@ -748,7 +778,8 @@ outputs: outputBinding: glob: "*_gene_markers.tsv" doc: | - Differentially expressed genes between each pair of clusters for all resolutions. + Gene markers per cluster for + all resolutions. TSV format peak_markers_tsv: @@ -756,7 +787,8 @@ outputs: outputBinding: glob: "*_peak_markers.tsv" doc: | - Differentially accessible peaks between each pair of clusters for all resolutions. + Peak markers per cluster for + all resolutions. TSV format ucsc_cb_config_data: diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index d6d04b83..cac807ab 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -45,10 +45,11 @@ inputs: label: "Single-cell Analysis with LSI Transformed ATAC-Seq Datasets" doc: | Analysis that includes single-cell - multiome ATAC and RNA-Seq or just - ATAC-Seq datasets run through "Single-cell - ATAC-Seq Dimensionality Reduction Analysis" - at any of the processing stages. + multiome RNA and ATAC-Seq or just + ATAC-Seq datasets run through + "Single-cell ATAC-Seq Dimensionality + Reduction Analysis" at any of the + processing stages. 
'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" 'sd:localLabel': true diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index 027c5f55..44d616d8 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -42,10 +42,11 @@ inputs: label: "Single-cell Analysis with PCA Transformed RNA-Seq Datasets" doc: | Analysis that includes single-cell - multiome ATAC and RNA-Seq or just - RNA-Seq datasets run through "Single-cell - RNA-Seq Dimensionality Reduction Analysis" - at any of the processing stages. + multiome RNA and ATAC-Seq or just + RNA-Seq datasets run through + "Single-cell RNA-Seq Dimensionality + Reduction Analysis" at any of the + processing stages. 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" 'sd:localLabel': true diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index 8b70c209..a7ade1e8 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -37,228 +37,121 @@ inputs: alias: type: string - label: "Experiment short name/alias" + label: "Analysis name" sd:preview: position: 1 query_data_rds: type: File - label: "Experiment run through both Single-cell RNA-Seq and ATAC-Seq Dimensionality Reduction Analyses" - doc: | - Path to the RDS file to load Seurat object from. This file should include - genes expression and chromatin accessibility information stored in the RNA - and ATAC assays correspondingly. Additionally, 'pca', 'rnaumap', 'atac_lsi' - and 'atacumap' dimensionality reductions should be present. + label: "Single-cell Analysis with both PCA and LSI Transformed Datasets" + doc: | + Analysis that includes single-cell + multiome RNA and ATAC-Seq datasets + run through both "Single-cell + RNA-Seq Dimensionality Reduction + Analysis" and "Single-cell ATAC-Seq + Dimensionality Reduction Analysis" + at any of the processing stages. 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" 'sd:localLabel': true + atac_fragments_file: + type: File? 
+ secondaryFiles: + - .tbi + label: "Cell Ranger ARC Sample (optional)" + doc: | + "Cell Ranger ARC Sample" for generating + fragments coverage plots over the genes + of interest. + 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" + 'sd:localLabel': true + rna_dimensions: type: int? default: 40 - label: "Dimensionality from the 'pca' reduction to use when constructing weighted nearest-neighbor graph before clustering (from 1 to 50)" + label: "Target RNA dimensionality" doc: | - Dimensionality from the 'pca' reduction to use when constructing weighted - nearest-neighbor graph before clustering (from 1 to 50). If single value N - is provided, use from 1 to N dimensions. If multiple values are provided, - subset to only selected dimensions. - Default: from 1 to 10 + Number of principal components to be used + in constructing weighted nearest-neighbor + graph before clustering. Accepted values + range from 1 to 50. + Default: 40 atac_dimensions: type: int? default: 40 - label: "Dimensionality from the 'atac_lsi' reduction to use when constructing weighted nearest-neighbor graph before clustering (from 1 to 50)" + label: "Target ATAC dimensionality" doc: | - Dimensionality from the 'atac_lsi' reduction to use when constructing weighted - nearest-neighbor graph before clustering (from 1 to 50). If single value N - is provided, use from 2 to N dimensions. If multiple values are provided, - subset to only selected dimensions. - Default: from 2 to 10 - - cluster_algorithm: - type: - - "null" - - type: enum - symbols: - - "louvain" - - "mult-louvain" - - "slm" - - "leiden" - default: "slm" - label: "Algorithm for modularity optimization when running clustering" - doc: | - Algorithm for modularity optimization when running clustering. - Default: slm + Number of LSI components to be used in + constructing weighted nearest-neighbor + graph before clustering. Accepted values + range from 2 to 50. First dimension is + always excluded + Default: 40 resolution: type: float? 
default: 0.3 label: "Clustering resolution" doc: | - Clustering resolution applied to the constructed weighted nearest-neighbor - graph. Can be set as an array but only the first item from the list will - be used for cluster labels and gene/peak markers in the UCSC Cell Browser - when running with --cbbuild and --diffgenes/--diffpeaks parameters. - Default: 0.3, 0.5, 1.0 - - atac_fragments_file: - type: File? - secondaryFiles: - - .tbi - label: "Cell Ranger ARC Count/Aggregate Experiment" - doc: | - Count and barcode information for every ATAC fragment used in the loaded Seurat - object. File should be saved in TSV format with tbi-index file. - 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" - 'sd:localLabel': true - - genes_of_interest: - type: string? - default: null - label: "Genes of interest to build gene expression and Tn5 insertion frequency plots" - doc: | - Genes of interest to build gene expression and Tn5 insertion frequency plots - for the nearest peaks. If '--fragments' is not provided only gene expression - plots will be built. - Default: None + Resolution to define the "granularity" + of the clustered data. Larger values + lead to a bigger number of clusters. + Optimal resolution often increases + with the number of cells. + Default: 0.3 identify_diff_genes: type: boolean? - default: false - label: "Identify differentially expressed genes (putative gene markers) between each pair of clusters" - doc: | - Identify differentially expressed genes (putative gene markers) between each - pair of clusters for all resolutions. - Default: false - 'sd:layout': - advanced: true + default: true + label: "Find gene markers" + doc: | + Identify upregulated genes in each + cluster compared to all other cells. + Include only genes that are expressed + in at least 10% of the cells coming + from either current cluster or from + all other clusters together. + Exclude genes with log2FoldChange + values less than 0.25. 
Use Wilcoxon + Rank Sum test to calculate P-values. + Keep only genes with P-values lower + than 0.01. Adjust P-values for multiple + comparisons using Bonferroni correction. + Default: true identify_diff_peaks: type: boolean? default: false - label: "Identify differentially accessible peaks between each pair of clusters" - doc: | - Identify differentially accessible peaks between each pair of clusters for all resolutions. + label: "Find peak markers" + doc: | + Identify differentially accessible + peaks in each cluster compared to + all other cells. Include only peaks + that are present in at least 5% of + the cells coming from either current + cluster or from all other clusters + together. Exclude peaks with + log2FoldChange values less than 0.25. + Use logistic regression framework to + calculate P-values. Keep only peaks + with P-values lower than 0.01. Adjust + P-values for multiple comparisons + using Bonferroni correction. Default: false - 'sd:layout': - advanced: true - - rna_minimum_logfc: - type: float? - default: 0.25 - label: "Include only those genes that on average have log fold change difference in expression between every tested pair of clusters not lower than this value" - doc: | - For putative gene markers identification include only those genes that - on average have log fold change difference in expression between every - tested pair of clusters not lower than this value. Ignored if '--diffgenes' - is not set. - Default: 0.25 - 'sd:layout': - advanced: true - rna_minimum_pct: - type: float? - default: 0.1 - label: "Include only those genes that are detected in not lower than this fraction of cells in either of the two tested clusters" - doc: | - For putative gene markers identification include only those genes that - are detected in not lower than this fraction of cells in either of the - two tested clusters. Ignored if '--diffgenes' is not set. - Default: 0.1 - 'sd:layout': - advanced: true - - atac_minimum_logfc: - type: float? 
- default: 0.25 - label: "Include only those peaks that on average have log fold change difference in the chromatin accessibility between every tested pair of clusters not lower than this value" - doc: | - For differentially accessible peaks identification include only those peaks that - on average have log fold change difference in the chromatin accessibility between - every tested pair of clusters not lower than this value. Ignored if '--diffpeaks' - is not set. - Default: 0.25 - 'sd:layout': - advanced: true - - atac_minimum_pct: - type: float? - default: 0.05 - label: "Include only those peaks that are detected in not lower than this fraction of cells in either of the two tested clusters" - doc: | - For differentially accessible peaks identification include only those peaks that - are detected in not lower than this fraction of cells in either of the two tested - clusters. Ignored if '--diffpeaks' is not set. - Default: 0.05 - 'sd:layout': - advanced: true - - umap_spread: - type: float? - label: "UMAP Spread - the effective scale of embedded points (determines how clustered/clumped the embedded points are)" - default: 1 - doc: | - The effective scale of embedded points on UMAP. In combination with '--mindist' - it determines how clustered/clumped the embedded points are. - Default: 1 - 'sd:layout': - advanced: true - - umap_mindist: - type: float? - label: "UMAP Min. Dist. - controls how tightly the embedding is allowed compress points together" - default: 0.3 + genes_of_interest: + type: string? + default: null + label: "Genes of interest" doc: | - Controls how tightly the embedding is allowed compress points together on UMAP. - Larger values ensure embedded points are moreevenly distributed, while smaller - values allow the algorithm to optimise more accurately with regard to local structure. - Sensible values are in the range 0.001 to 0.5. - Default: 0.3 - 'sd:layout': - advanced: true - - umap_neighbors: - type: int? 
- label: "UMAP Neighbors Number - determines the number of neighboring points used" - default: 30 - doc: | - Determines the number of neighboring points used in UMAP. Larger values will result - in more global structure being preserved at the loss of detailed local structure. - In general this parameter should often be in the range 5 to 50. - Default: 30 - 'sd:layout': - advanced: true - - umap_metric: - type: - - "null" - - type: enum - symbols: - - "euclidean" - - "cosine" - - "correlation" - label: "UMAP Dist. Metric - the metric to use to compute distances in high dimensional space" - default: "cosine" - doc: | - The metric to use to compute distances in high dimensional space for UMAP. - Default: cosine - 'sd:layout': - advanced: true - - umap_method: - type: - - "null" - - type: enum - symbols: - - "uwot" - - "uwot-learn" - - "umap-learn" - label: "UMAP implementation to run (if set to 'umap-learn' use 'correlation' distance metric)" - default: "uwot" - doc: | - UMAP implementation to run. If set to 'umap-learn' use --umetric 'correlation' - Default: uwot - 'sd:layout': - advanced: true + Comma or space separated list of genes + of interest to visualize expression and + to generate fragments coverage plots. + Ignored if "Cell Ranger ARC Sample" input + is not provided. + Default: None color_theme: type: @@ -274,41 +167,12 @@ inputs: - "classic" - "void" default: "classic" - label: "Color theme for all generated plots" + label: "Plots color theme" doc: | - Color theme for all generated plots. One of gray, bw, linedraw, light, - dark, minimal, classic, void. + Color theme for all plots saved + as PNG files. Default: classic - 'sd:layout': - advanced: true - - parallel_memory_limit: - type: - - "null" - - type: enum - symbols: - - "32" - default: "32" - label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" - doc: | - Maximum memory in GB allowed to be shared between the workers - when using multiple --cpus. 
- Forced to 32 GB - 'sd:layout': - advanced: true - - vector_memory_limit: - type: - - "null" - - type: enum - symbols: - - "64" - default: "64" - label: "Maximum vector memory in GB allowed to be used by R" - doc: | - Maximum vector memory in GB allowed to be used by R. - Forced to 64 GB - 'sd:layout': + "sd:layout": advanced: true threads: @@ -317,12 +181,15 @@ inputs: - type: enum symbols: - "1" + - "2" default: "1" - label: "Number of cores/cpus to use" + label: "Cores/CPUs number" doc: | - Number of cores/cpus to use - Forced to 1 - 'sd:layout': + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 1 + "sd:layout": advanced: true @@ -334,14 +201,44 @@ outputs: - type: array items: File outputSource: sc_wnn_cluster/umap_res_plot_png - label: "Clustered cells UMAP" + label: "UMAP, colored by cluster" doc: | - Clustered cells UMAP. - PNG format + UMAP, colored by cluster 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Clustered cells UMAP' + tab: 'Per cluster' + Caption: 'UMAP, colored by cluster' + + umap_spl_ph_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/umap_spl_ph_res_plot_png + label: "UMAP, colored by cluster, split by cell cycle phase" + doc: | + UMAP, colored by cluster, + split by cell cycle phase + 'sd:visualPlugins': + - image: + tab: 'Per cluster' + Caption: 'UMAP, colored by cluster, split by cell cycle phase' + + cmp_gr_ph_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/cmp_gr_ph_spl_clst_res_plot_png + label: "Composition plot, colored by cell cycle phase, split by cluster, downsampled" + doc: | + Composition plot, colored by + cell cycle phase, split by + cluster, downsampled + 'sd:visualPlugins': + - image: + tab: 'Per cluster' + Caption: 'Composition plot, colored by cell cycle phase, split by cluster, downsampled' umap_spl_idnt_res_plot_png: type: @@ -349,14 +246,14 @@ 
outputs: - type: array items: File outputSource: sc_wnn_cluster/umap_spl_idnt_res_plot_png - label: "Split by dataset clustered cells UMAP" + label: "UMAP, colored by cluster, split by dataset" doc: | - Split by dataset clustered cells UMAP. - PNG format + UMAP, colored by cluster, + split by dataset 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by dataset clustered cells UMAP' + Caption: 'UMAP, colored by cluster, split by dataset' cmp_gr_clst_spl_idnt_res_plot_png: type: @@ -364,14 +261,15 @@ outputs: - type: array items: File outputSource: sc_wnn_cluster/cmp_gr_clst_spl_idnt_res_plot_png - label: "Grouped by cluster split by dataset cells composition plot. Downsampled." + label: "Composition plot, colored by cluster, split by dataset, downsampled" doc: | - Grouped by cluster split by dataset cells composition plot. Downsampled. - PNG format + Composition plot, colored by + cluster, split by dataset, + downsampled 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Grouped by cluster split by dataset cells composition plot. Downsampled.' + tab: 'Per dataset' + Caption: 'Composition plot, colored by cluster, split by dataset, downsampled' cmp_gr_idnt_spl_clst_res_plot_png: type: @@ -379,14 +277,28 @@ outputs: - type: array items: File outputSource: sc_wnn_cluster/cmp_gr_idnt_spl_clst_res_plot_png - label: "Grouped by dataset split by cluster cells composition plot. Downsampled." + label: "Composition plot, colored by dataset, split by cluster, downsampled" + doc: | + Composition plot, colored by + dataset, split by cluster, + downsampled + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Composition plot, colored by dataset, split by cluster, downsampled' + + cmp_gr_ph_spl_idnt_plot_png: + type: File? + outputSource: sc_wnn_cluster/cmp_gr_ph_spl_idnt_plot_png + label: "Composition plot, colored by cell cycle phase, split by dataset, downsampled" doc: | - Grouped by dataset split by cluster cells composition plot. 
Downsampled. - PNG format + Composition plot, colored by + cell cycle phase, split by + dataset, downsampled 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Grouped by dataset split by cluster cells composition plot. Downsampled.' + tab: 'Per dataset' + Caption: 'Composition plot, colored by cell cycle phase, split by dataset, downsampled' umap_spl_cnd_res_plot_png: type: @@ -394,14 +306,14 @@ outputs: - type: array items: File outputSource: sc_wnn_cluster/umap_spl_cnd_res_plot_png - label: "Split by grouping condition clustered cells UMAP" + label: "UMAP, colored by cluster, split by grouping condition" doc: | - Split by grouping condition clustered cells UMAP. - PNG format + UMAP, colored by cluster, + split by grouping condition 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Split by grouping condition clustered cells UMAP' + Caption: 'UMAP, colored by cluster, split by grouping condition' cmp_gr_clst_spl_cnd_res_plot_png: type: @@ -409,14 +321,15 @@ outputs: - type: array items: File outputSource: sc_wnn_cluster/cmp_gr_clst_spl_cnd_res_plot_png - label: "Grouped by cluster split by condition cells composition plot. Downsampled." + label: "Composition plot, colored by cluster, split by grouping condition, downsampled" doc: | - Grouped by cluster split by condition cells composition plot. Downsampled. - PNG format + Composition plot, colored by + cluster, split by grouping + condition, downsampled 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Grouped by cluster split by condition cells composition plot. Downsampled.' + Caption: 'Composition plot, colored by cluster, split by grouping condition, downsampled' cmp_gr_cnd_spl_clst_res_plot_png: type: @@ -424,71 +337,43 @@ outputs: - type: array items: File outputSource: sc_wnn_cluster/cmp_gr_cnd_spl_clst_res_plot_png - label: "Grouped by condition split by cluster cells composition plot. Downsampled." 
+ label: "Composition plot, colored by grouping condition, split by cluster, downsampled" doc: | - Grouped by condition split by cluster cells composition plot. Downsampled. - PNG format + Composition plot, colored by + grouping condition, split by + cluster, downsampled 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Grouped by condition split by cluster cells composition plot. Downsampled.' - - umap_spl_ph_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_wnn_cluster/umap_spl_ph_res_plot_png - label: "Split by cell cycle phase clustered cells UMAP" - doc: | - Split by cell cycle phase clustered cells UMAP. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Per dataset' - Caption: 'Split by cell cycle phase clustered cells UMAP' + Caption: 'Composition plot, colored by grouping condition, split by cluster, downsampled' - cmp_gr_ph_spl_idnt_plot_png: - type: File? - outputSource: sc_wnn_cluster/cmp_gr_ph_spl_idnt_plot_png - label: "Grouped by cell cycle phase split by dataset cells composition plot. Downsampled." - doc: | - Grouped by cell cycle phase split by dataset cells composition plot. Downsampled. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Per dataset' - Caption: 'Grouped by cell cycle phase split by dataset cells composition plot. Downsampled.' - - cmp_gr_ph_spl_clst_res_plot_png: + xpr_avg_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_wnn_cluster/cmp_gr_ph_spl_clst_res_plot_png - label: "Grouped by cell cycle phase split by cluster cells composition plot. Downsampled." + outputSource: sc_wnn_cluster/xpr_avg_res_plot_png + label: "Gene expression dot plot" doc: | - Grouped by cell cycle phase split by cluster cells composition plot. Downsampled. - PNG format + Gene expression dot plot 'sd:visualPlugins': - image: - tab: 'Per dataset' - Caption: 'Grouped by cell cycle phase split by cluster cells composition plot. Downsampled.' 
+ tab: 'Genes of interest' + Caption: 'Gene expression dot plot' - xpr_avg_res_plot_png: + xpr_dnst_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_wnn_cluster/xpr_avg_res_plot_png - label: "Log normalized scaled average gene expression per cluster" + outputSource: sc_wnn_cluster/xpr_dnst_res_plot_png + label: "Gene expression violin plot" doc: | - Log normalized scaled average gene expression per cluster. - PNG format + Gene expression violin plot 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized scaled average gene expression per cluster' + tab: 'Genes of interest' + Caption: 'Gene expression violin plot' xpr_per_cell_plot_png: type: @@ -496,14 +381,13 @@ outputs: - type: array items: File outputSource: sc_wnn_cluster/xpr_per_cell_plot_png - label: "Log normalized gene expression on cells UMAP" + label: "UMAP, gene expression" doc: | - Log normalized gene expression on cells UMAP. - PNG format + UMAP, gene expression 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression on cells UMAP' + tab: 'Genes of interest' + Caption: 'UMAP, gene expression' xpr_per_cell_sgnl_plot_png: type: @@ -511,29 +395,27 @@ outputs: - type: array items: File outputSource: sc_wnn_cluster/xpr_per_cell_sgnl_plot_png - label: "Log normalized gene expression density on cells UMAP" + label: "UMAP, gene expression density" doc: | - Log normalized gene expression density on cells UMAP. 
- PNG format + UMAP, gene expression density 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression density on cells UMAP' + tab: 'Genes of interest' + Caption: 'UMAP, gene expression density' - xpr_dnst_res_plot_png: + xpr_htmp_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_wnn_cluster/xpr_dnst_res_plot_png - label: "Log normalized gene expression density per cluster" + outputSource: sc_wnn_cluster/xpr_htmp_res_plot_png + label: "Gene expression heatmap" doc: | - Log normalized gene expression density per cluster. - PNG format + Gene expression heatmap 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression density per cluster' + tab: 'Heatmap' + Caption: 'Gene expression heatmap' cvrg_res_plot_png: type: @@ -541,70 +423,55 @@ outputs: - type: array items: File outputSource: sc_wnn_cluster/cvrg_res_plot_png - label: "Tn5 insertion frequency plot around gene" + label: "Fragments coverage" doc: | - Tn5 insertion frequency plot around gene. - PNG format + Fragments coverage 'sd:visualPlugins': - image: tab: 'Genome coverage' - Caption: 'Tn5 insertion frequency plot around gene' - - xpr_htmp_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_wnn_cluster/xpr_htmp_res_plot_png - label: "Normalized gene expression heatmap grouped by cluster" - doc: | - Normalized gene expression heatmap grouped by cluster. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Gene expression' - Caption: 'Normalized gene expression heatmap grouped by cluster' + Caption: 'Fragments coverage' gene_markers_tsv: type: File? outputSource: sc_wnn_cluster/gene_markers_tsv - label: "Differentially expressed genes between each pair of clusters" + label: "Gene markers per cluster for all resolutions" doc: | - Differentially expressed genes between each pair of clusters for all resolutions. 
- TSV format + Gene markers per cluster for + all resolutions 'sd:visualPlugins': - syncfusiongrid: tab: 'Gene markers' - Title: 'Differentially expressed genes between each pair of clusters' + Title: 'Gene markers per cluster for all resolutions' peak_markers_tsv: type: File? outputSource: sc_wnn_cluster/peak_markers_tsv - label: "Differentially accessible peaks between each pair of clusters" + label: "Peak markers per cluster for all resolutions" doc: | - Differentially accessible peaks between each pair of clusters for all resolutions. - TSV format + Peak markers per cluster for + all resolutions 'sd:visualPlugins': - syncfusiongrid: - tab: 'Diff. peaks' - Title: 'Differentially accessible peaks between each pair of clusters' + tab: 'Peak markers' + Title: 'Peak markers per cluster for all resolutions' ucsc_cb_html_data: - type: Directory + type: Directory? outputSource: sc_wnn_cluster/ucsc_cb_html_data - label: "Directory with UCSC Cellbrowser html data" + label: "UCSC Cell Browser data" doc: | - Directory with UCSC Cellbrowser html data. + Directory with UCSC Cell Browser + data ucsc_cb_html_file: - type: File + type: File? outputSource: sc_wnn_cluster/ucsc_cb_html_file - label: "Open in UCSC Cell Browser" + label: "UCSC Cell Browser" doc: | - HTML index file from the directory with UCSC Cellbrowser html data. 
- 'sd:visualPlugins': + UCSC Cell Browser HTML index file + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" seurat_data_rds: @@ -649,7 +516,8 @@ steps: query_data_rds: query_data_rds rna_dimensions: rna_dimensions atac_dimensions: atac_dimensions - cluster_algorithm: cluster_algorithm + cluster_algorithm: + default: "slm" resolution: resolution atac_fragments_file: atac_fragments_file genes_of_interest: @@ -657,21 +525,20 @@ steps: valueFrom: $(split_features(self)) identify_diff_genes: identify_diff_genes identify_diff_peaks: identify_diff_peaks - rna_minimum_logfc: rna_minimum_logfc - rna_minimum_pct: rna_minimum_pct - atac_minimum_logfc: atac_minimum_logfc - atac_minimum_pct: atac_minimum_pct + rna_minimum_logfc: + default: 0.25 + rna_minimum_pct: + default: 0.1 + atac_minimum_logfc: + default: 0.25 + atac_minimum_pct: + default: 0.05 only_positive_diff_genes: default: true rna_test_to_use: default: wilcox atac_test_to_use: default: LR - umap_spread: umap_spread - umap_mindist: umap_mindist - umap_neighbors: umap_neighbors - umap_metric: umap_metric - umap_method: umap_method verbose: default: true export_ucsc_cb: @@ -680,11 +547,9 @@ steps: default: true color_theme: color_theme parallel_memory_limit: - source: parallel_memory_limit - valueFrom: $(parseInt(self)) + default: 32 vector_memory_limit: - source: vector_memory_limit - valueFrom: $(parseInt(self)) + default: 96 threads: source: threads valueFrom: $(parseInt(self)) @@ -763,5 +628,6 @@ s:creator: doc: | Single-cell WNN Cluster Analysis - Clusters multiome ATAC and RNA-Seq datasets, identifies gene markers - and differentially accessible peaks. \ No newline at end of file + Clusters multiome ATAC and RNA-Seq datasets, + identifies gene markers and differentially + accessible peaks. 
\ No newline at end of file From 909e52cd51482e12e9d545b60e6ddad9a078fd96 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 14 Jul 2023 11:34:33 -0400 Subject: [PATCH 055/162] Not important changes --- workflows/cellranger-arc-aggr.cwl | 2 +- workflows/cellranger-arc-count.cwl | 2 +- workflows/sc-atac-cluster.cwl | 2 +- workflows/sc-atac-reduce.cwl | 2 +- workflows/sc-multiome-filter.cwl | 2 +- workflows/sc-rna-cluster.cwl | 2 +- workflows/sc-rna-reduce.cwl | 2 +- workflows/sc-wnn-cluster.cwl | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/workflows/cellranger-arc-aggr.cwl b/workflows/cellranger-arc-aggr.cwl index d4f364f0..d34727a8 100644 --- a/workflows/cellranger-arc-aggr.cwl +++ b/workflows/cellranger-arc-aggr.cwl @@ -95,7 +95,7 @@ inputs: - "3" - "4" default: "4" - label: "Cores/CPUs number" + label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized diff --git a/workflows/cellranger-arc-count.cwl b/workflows/cellranger-arc-count.cwl index 25d7e949..7ec8ef24 100644 --- a/workflows/cellranger-arc-count.cwl +++ b/workflows/cellranger-arc-count.cwl @@ -129,7 +129,7 @@ inputs: - "3" - "4" default: "4" - label: "Cores/CPUs number" + label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index cac807ab..a3f24fda 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -150,7 +150,7 @@ inputs: - "1" - "2" default: "1" - label: "Cores/CPUs number" + label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index 04e31fec..36755889 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -213,7 +213,7 @@ inputs: - "1" - "2" default: "1" - label: "Cores/CPUs number" + label: "Cores/CPUs" doc: | 
Parallelization parameter to define the number of cores/CPUs that can be utilized diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index a80bcaee..34dc2091 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -412,7 +412,7 @@ inputs: - "3" - "4" default: "1" - label: "Cores/CPUs number" + label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index 44d616d8..53adc2ab 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -135,7 +135,7 @@ inputs: - "1" - "2" default: "1" - label: "Cores/CPUs number" + label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index f4091068..92df432b 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -289,7 +289,7 @@ inputs: - "1" - "2" default: "1" - label: "Cores/CPUs number" + label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index a7ade1e8..62c017d3 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -183,7 +183,7 @@ inputs: - "1" - "2" default: "1" - label: "Cores/CPUs number" + label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized From f88b6309d0fc406b60d4c1778c74a40789c7296f Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 14 Jul 2023 15:54:31 -0400 Subject: [PATCH 056/162] Refactor sc-ctype-assign workflow --- tools/sc-ctype-assign.cwl | 221 ++++++++---- workflows/sc-ctype-assign.cwl | 629 ++++++++++++++++------------------ 2 files changed, 436 insertions(+), 414 deletions(-) diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl 
index c332280a..3fb6c3d0 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -242,6 +242,18 @@ inputs: Save Seurat data to h5ad file. Default: false + export_scope_data: + type: boolean? + inputBinding: + prefix: "--scope" + doc: | + Save Seurat data to SCope compatible + loom file. Only not normalized raw + counts from the RNA assay will be + saved. If loaded Seurat object doesn't + have RNA assay this parameter will be + ignored. Default: false + export_ucsc_cb: type: boolean? inputBinding: @@ -289,7 +301,7 @@ outputs: outputBinding: glob: "*_umap_rd_rnaumap.png" doc: | - Cells UMAP with assigned cell types (rnaumap dim. reduction). + UMAP, colored by cell type, RNA. PNG format umap_rd_rnaumap_plot_pdf: @@ -297,7 +309,7 @@ outputs: outputBinding: glob: "*_umap_rd_rnaumap.pdf" doc: | - Cells UMAP with assigned cell types (rnaumap dim. reduction). + UMAP, colored by cell type, RNA. PDF format umap_rd_atacumap_plot_png: @@ -305,7 +317,7 @@ outputs: outputBinding: glob: "*_umap_rd_atacumap.png" doc: | - Cells UMAP with assigned cell types (atacumap dim. reduction). + UMAP, colored by cell type, ATAC. PNG format umap_rd_atacumap_plot_pdf: @@ -313,7 +325,7 @@ outputs: outputBinding: glob: "*_umap_rd_atacumap.pdf" doc: | - Cells UMAP with assigned cell types (atacumap dim. reduction). + UMAP, colored by cell type, ATAC. PDF format umap_rd_wnnumap_plot_png: @@ -321,7 +333,7 @@ outputs: outputBinding: glob: "*_umap_rd_wnnumap.png" doc: | - Cells UMAP with assigned cell types (wnnumap dim. reduction). + UMAP, colored by cell type, WNN. PNG format umap_rd_wnnumap_plot_pdf: @@ -329,7 +341,7 @@ outputs: outputBinding: glob: "*_umap_rd_wnnumap.pdf" doc: | - Cells UMAP with assigned cell types (wnnumap dim. reduction). + UMAP, colored by cell type, WNN. PDF format umap_spl_idnt_rd_rnaumap_plot_png: @@ -337,7 +349,8 @@ outputs: outputBinding: glob: "*_umap_spl_idnt_rd_rnaumap.png" doc: | - Split by dataset cells UMAP with assigned cell types (rnaumap dim. 
reduction). + UMAP, colored by cell type, + split by dataset, RNA. PNG format umap_spl_idnt_rd_rnaumap_plot_pdf: @@ -345,7 +358,8 @@ outputs: outputBinding: glob: "*_umap_spl_idnt_rd_rnaumap.pdf" doc: | - Split by dataset cells UMAP with assigned cell types (rnaumap dim. reduction). + UMAP, colored by cell type, + split by dataset, RNA. PDF format umap_spl_idnt_rd_atacumap_plot_png: @@ -353,7 +367,8 @@ outputs: outputBinding: glob: "*_umap_spl_idnt_rd_atacumap.png" doc: | - Split by dataset cells UMAP with assigned cell types (atacumap dim. reduction). + UMAP, colored by cell type, + split by dataset, ATAC. PNG format umap_spl_idnt_rd_atacumap_plot_pdf: @@ -361,7 +376,8 @@ outputs: outputBinding: glob: "*_umap_spl_idnt_rd_atacumap.pdf" doc: | - Split by dataset cells UMAP with assigned cell types (atacumap dim. reduction). + UMAP, colored by cell type, + split by dataset, ATAC. PDF format umap_spl_idnt_rd_wnnumap_plot_png: @@ -369,7 +385,8 @@ outputs: outputBinding: glob: "*_umap_spl_idnt_rd_wnnumap.png" doc: | - Split by dataset cells UMAP with assigned cell types (wnnumap dim. reduction). + UMAP, colored by cell type, + split by dataset, WNN. PNG format umap_spl_idnt_rd_wnnumap_plot_pdf: @@ -377,7 +394,8 @@ outputs: outputBinding: glob: "*_umap_spl_idnt_rd_wnnumap.pdf" doc: | - Split by dataset cells UMAP with assigned cell types (wnnumap dim. reduction). + UMAP, colored by cell type, + split by dataset, WNN. PDF format umap_spl_cnd_rd_rnaumap_plot_png: @@ -385,7 +403,8 @@ outputs: outputBinding: glob: "*_umap_spl_cnd_rd_rnaumap.png" doc: | - Split by grouping condition cells UMAP with assigned cell types (rnaumap dim. reduction). + UMAP, colored by cell type, split + by grouping condition, RNA. PNG format umap_spl_cnd_rd_rnaumap_plot_pdf: @@ -393,7 +412,8 @@ outputs: outputBinding: glob: "*_umap_spl_cnd_rd_rnaumap.pdf" doc: | - Split by grouping condition cells UMAP with assigned cell types (rnaumap dim. reduction). 
+ UMAP, colored by cell type, split + by grouping condition, RNA. PDF format umap_spl_cnd_rd_atacumap_plot_png: @@ -401,7 +421,8 @@ outputs: outputBinding: glob: "*_umap_spl_cnd_rd_atacumap.png" doc: | - Split by grouping condition cells UMAP with assigned cell types (atacumap dim. reduction). + UMAP, colored by cell type, split + by grouping condition, ATAC. PNG format umap_spl_cnd_rd_atacumap_plot_pdf: @@ -409,7 +430,8 @@ outputs: outputBinding: glob: "*_umap_spl_cnd_rd_atacumap.pdf" doc: | - Split by grouping condition cells UMAP with assigned cell types (atacumap dim. reduction). + UMAP, colored by cell type, split + by grouping condition, ATAC. PDF format umap_spl_cnd_rd_wnnumap_plot_png: @@ -417,7 +439,8 @@ outputs: outputBinding: glob: "*_umap_spl_cnd_rd_wnnumap.png" doc: | - Split by grouping condition cells UMAP with assigned cell types (wnnumap dim. reduction). + UMAP, colored by cell type, split + by grouping condition, WNN. PNG format umap_spl_cnd_rd_wnnumap_plot_pdf: @@ -425,7 +448,8 @@ outputs: outputBinding: glob: "*_umap_spl_cnd_rd_wnnumap.pdf" doc: | - Split by grouping condition cells UMAP with assigned cell types (wnnumap dim. reduction). + UMAP, colored by cell type, split + by grouping condition, WNN. PDF format umap_spl_ph_rd_rnaumap_plot_png: @@ -433,7 +457,8 @@ outputs: outputBinding: glob: "*_umap_spl_ph_rd_rnaumap.png" doc: | - Split by cell cycle phase cells UMAP with assigned cell types (rnaumap dim. reduction). + UMAP, colored by cell type, split + by cell cycle phase, RNA. PNG format umap_spl_ph_rd_rnaumap_plot_pdf: @@ -441,7 +466,8 @@ outputs: outputBinding: glob: "*_umap_spl_ph_rd_rnaumap.pdf" doc: | - Split by cell cycle phase cells UMAP with assigned cell types (rnaumap dim. reduction). + UMAP, colored by cell type, split + by cell cycle phase, RNA. 
PDF format umap_spl_ph_rd_atacumap_plot_png: @@ -449,7 +475,8 @@ outputs: outputBinding: glob: "*_umap_spl_ph_rd_atacumap.png" doc: | - Split by cell cycle phase cells UMAP with assigned cell types (atacumap dim. reduction). + UMAP, colored by cell type, split + by cell cycle phase, ATAC. PNG format umap_spl_ph_rd_atacumap_plot_pdf: @@ -457,7 +484,8 @@ outputs: outputBinding: glob: "*_umap_spl_ph_rd_atacumap.pdf" doc: | - Split by cell cycle phase cells UMAP with assigned cell types (atacumap dim. reduction). + UMAP, colored by cell type, split + by cell cycle phase, ATAC. PDF format umap_spl_ph_rd_wnnumap_plot_png: @@ -465,7 +493,8 @@ outputs: outputBinding: glob: "*_umap_spl_ph_rd_wnnumap.png" doc: | - Split by cell cycle phase cells UMAP with assigned cell types (wnnumap dim. reduction). + UMAP, colored by cell type, split + by cell cycle phase, WNN. PNG format umap_spl_ph_rd_wnnumap_plot_pdf: @@ -473,7 +502,8 @@ outputs: outputBinding: glob: "*_umap_spl_ph_rd_wnnumap.pdf" doc: | - Split by cell cycle phase cells UMAP with assigned cell types (wnnumap dim. reduction). + UMAP, colored by cell type, split + by cell cycle phase, WNN. PDF format cmp_gr_ctyp_spl_idnt_plot_png: @@ -481,7 +511,8 @@ outputs: outputBinding: glob: "*_cmp_gr_ctyp_spl_idnt.png" doc: | - Grouped by cell type split by dataset cells composition plot. Downsampled. + Composition plot, colored by cell + type, split by dataset, downsampled. PNG format cmp_gr_ctyp_spl_idnt_plot_pdf: @@ -489,7 +520,8 @@ outputs: outputBinding: glob: "*_cmp_gr_ctyp_spl_idnt.pdf" doc: | - Grouped by cell type split by dataset cells composition plot. Downsampled. + Composition plot, colored by cell + type, split by dataset, downsampled. PDF format cmp_gr_idnt_spl_ctyp_plot_png: @@ -497,7 +529,9 @@ outputs: outputBinding: glob: "*_cmp_gr_idnt_spl_ctyp.png" doc: | - Grouped by dataset split by cell type cells composition plot. Downsampled. + Composition plot, colored by + dataset, split by cell type, + downsampled. 
PNG format cmp_gr_idnt_spl_ctyp_plot_pdf: @@ -505,7 +539,9 @@ outputs: outputBinding: glob: "*_cmp_gr_idnt_spl_ctyp.pdf" doc: | - Grouped by dataset split by cell type cells composition plot. Downsampled. + Composition plot, colored by + dataset, split by cell type, + downsampled. PDF format cmp_gr_ph_spl_idnt_plot_png: @@ -513,7 +549,9 @@ outputs: outputBinding: glob: "*_cmp_gr_ph_spl_idnt.png" doc: | - Grouped by cell cycle phase split by dataset cells composition plot. Downsampled. + Composition plot, colored by cell + cycle phase, split by dataset, + downsampled. PNG format cmp_gr_ph_spl_idnt_plot_pdf: @@ -521,7 +559,9 @@ outputs: outputBinding: glob: "*_cmp_gr_ph_spl_idnt.pdf" doc: | - Grouped by cell cycle phase split by dataset cells composition plot. Downsampled. + Composition plot, colored by cell + cycle phase, split by dataset, + downsampled. PDF format cmp_gr_ctyp_spl_cnd_plot_png: @@ -529,7 +569,9 @@ outputs: outputBinding: glob: "*_cmp_gr_ctyp_spl_cnd.png" doc: | - Grouped by cell type split by condition cells composition plot. Downsampled. + Composition plot, colored by cell + type, split by grouping condition, + downsampled. PNG format cmp_gr_ctyp_spl_cnd_plot_pdf: @@ -537,7 +579,9 @@ outputs: outputBinding: glob: "*_cmp_gr_ctyp_spl_cnd.pdf" doc: | - Grouped by cell type split by condition cells composition plot. Downsampled. + Composition plot, colored by cell + type, split by grouping condition, + downsampled. PDF format cmp_gr_cnd_spl_ctyp_plot_png: @@ -545,7 +589,9 @@ outputs: outputBinding: glob: "*_cmp_gr_cnd_spl_ctyp.png" doc: | - Grouped by condition split by cell type cells composition plot. Downsampled. + Composition plot, colored by + grouping condition, split by + cell type, downsampled. PNG format cmp_gr_cnd_spl_ctyp_plot_pdf: @@ -553,7 +599,9 @@ outputs: outputBinding: glob: "*_cmp_gr_cnd_spl_ctyp.pdf" doc: | - Grouped by condition split by cell type cells composition plot. Downsampled. 
+ Composition plot, colored by + grouping condition, split by + cell type, downsampled. PDF format cmp_gr_ph_spl_ctyp_plot_png: @@ -561,7 +609,9 @@ outputs: outputBinding: glob: "*_cmp_gr_ph_spl_ctyp.png" doc: | - Grouped by cell cycle phase split by cell type cells composition plot. Downsampled. + Composition plot, colored by cell + cycle phase, split by cell type, + downsampled. PNG format cmp_gr_ph_spl_ctyp_plot_pdf: @@ -569,7 +619,9 @@ outputs: outputBinding: glob: "*_cmp_gr_ph_spl_ctyp.pdf" doc: | - Grouped by cell cycle phase split by cell type cells composition plot. Downsampled. + Composition plot, colored by cell + cycle phase, split by cell type, + downsampled. PDF format xpr_avg_plot_png: @@ -577,7 +629,7 @@ outputs: outputBinding: glob: "*_xpr_avg.png" doc: | - Log normalized scaled average gene expression per cell type. + Gene expression dot plot. PNG format xpr_avg_plot_pdf: @@ -585,7 +637,7 @@ outputs: outputBinding: glob: "*_xpr_avg.pdf" doc: | - Log normalized scaled average gene expression per cell type. + Gene expression dot plot. PDF format xpr_dnst_plot_png: @@ -596,7 +648,7 @@ outputs: outputBinding: glob: "*_xpr_dnst_*.png" doc: | - Log normalized gene expression density per cell type. + Gene expression violin plot. PNG format xpr_dnst_plot_pdf: @@ -607,7 +659,7 @@ outputs: outputBinding: glob: "*_xpr_dnst_*.pdf" doc: | - Log normalized gene expression density per cell type. + Gene expression violin plot. PDF format xpr_per_cell_rd_rnaumap_plot_png: @@ -618,7 +670,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_rnaumap_*.png" doc: | - Log normalized gene expression on cells UMAP with assigned cell types (rnaumap dim. reduction). + UMAP, gene expression, RNA. PNG format xpr_per_cell_rd_rnaumap_plot_pdf: @@ -629,7 +681,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_rnaumap_*.pdf" doc: | - Log normalized gene expression on cells UMAP with assigned cell types (rnaumap dim. reduction). + UMAP, gene expression, RNA. 
PDF format xpr_per_cell_rd_atacumap_plot_png: @@ -640,7 +692,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_atacumap_*.png" doc: | - Log normalized gene expression on cells UMAP with assigned cell types (atacumap dim. reduction). + UMAP, gene expression, ATAC. PNG format xpr_per_cell_rd_atacumap_plot_pdf: @@ -651,7 +703,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_atacumap_*.pdf" doc: | - Log normalized gene expression on cells UMAP with assigned cell types (atacumap dim. reduction). + UMAP, gene expression, ATAC. PDF format xpr_per_cell_rd_wnnumap_plot_png: @@ -662,7 +714,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_wnnumap_*.png" doc: | - Log normalized gene expression on cells UMAP with assigned cell types (wnnumap dim. reduction). + UMAP, gene expression, WNN. PNG format xpr_per_cell_rd_wnnumap_plot_pdf: @@ -673,7 +725,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_wnnumap_*.pdf" doc: | - Log normalized gene expression on cells UMAP with assigned cell types (wnnumap dim. reduction). + UMAP, gene expression, WNN. PDF format xpr_per_cell_sgnl_rd_rnaumap_plot_png: @@ -684,7 +736,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_sgnl_rd_rnaumap_*.png" doc: | - Log normalized gene expression density on cells UMAP with assigned cell types (rnaumap dim. reduction). + UMAP, gene expression density, RNA. PNG format xpr_per_cell_sgnl_rd_rnaumap_plot_pdf: @@ -695,7 +747,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_sgnl_rd_rnaumap_*.pdf" doc: | - Log normalized gene expression density on cells UMAP with assigned cell types (rnaumap dim. reduction). + UMAP, gene expression density, RNA. PDF format xpr_per_cell_sgnl_rd_atacumap_plot_png: @@ -706,7 +758,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_sgnl_rd_atacumap_*.png" doc: | - Log normalized gene expression density on cells UMAP with assigned cell types (atacumap dim. reduction). + UMAP, gene expression density, ATAC. 
PNG format xpr_per_cell_sgnl_rd_atacumap_plot_pdf: @@ -717,7 +769,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_sgnl_rd_atacumap_*.pdf" doc: | - Log normalized gene expression density on cells UMAP with assigned cell types (atacumap dim. reduction). + UMAP, gene expression density, ATAC. PDF format xpr_per_cell_sgnl_rd_wnnumap_plot_png: @@ -728,7 +780,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_sgnl_rd_wnnumap_*.png" doc: | - Log normalized gene expression density on cells UMAP with assigned cell types (wnnumap dim. reduction). + UMAP, gene expression density, WNN. PNG format xpr_per_cell_sgnl_rd_wnnumap_plot_pdf: @@ -739,7 +791,7 @@ outputs: outputBinding: glob: "*_xpr_per_cell_sgnl_rd_wnnumap_*.pdf" doc: | - Log normalized gene expression density on cells UMAP with assigned cell types (wnnumap dim. reduction). + UMAP, gene expression density, WNN. PDF format cvrg_plot_png: @@ -750,7 +802,7 @@ outputs: outputBinding: glob: "*_cvrg_*.png" doc: | - Tn5 insertion frequency plot around gene. + Fragments coverage. PNG format cvrg_plot_pdf: @@ -761,7 +813,7 @@ outputs: outputBinding: glob: "*_cvrg_*.pdf" doc: | - Tn5 insertion frequency plot around gene. + Fragments coverage. PDF format xpr_htmp_plot_png: @@ -769,7 +821,7 @@ outputs: outputBinding: glob: "*_xpr_htmp.png" doc: | - Normalized gene expression heatmap grouped by cell type. + Gene expression heatmap. PNG format xpr_htmp_plot_pdf: @@ -777,7 +829,7 @@ outputs: outputBinding: glob: "*_xpr_htmp.pdf" doc: | - Normalized gene expression heatmap grouped by cell type. + Gene expression heatmap. PDF format gene_markers_tsv: @@ -785,7 +837,8 @@ outputs: outputBinding: glob: "*_gene_markers.tsv" doc: | - Differentially expressed genes between each pair of cell types. + Differentially expressed genes + between each pair of cell types. TSV format peak_markers_tsv: @@ -793,7 +846,8 @@ outputs: outputBinding: glob: "*_peak_markers.tsv" doc: | - Differentially accessible peaks between each pair of cell types. 
+ Differentially accessible peaks + between each pair of cell types. TSV format ucsc_cb_config_data: @@ -801,21 +855,24 @@ outputs: outputBinding: glob: "*_cellbrowser" doc: | - Directory with UCSC Cellbrowser configuration data. + Directory with UCSC Cellbrowser + configuration data. ucsc_cb_html_data: type: Directory? outputBinding: glob: "*_cellbrowser/html_data" doc: | - Directory with UCSC Cellbrowser html data. + Directory with UCSC Cellbrowser + html data. ucsc_cb_html_file: type: File? outputBinding: glob: "*_cellbrowser/html_data/index.html" doc: | - HTML index file from the directory with UCSC Cellbrowser html data. + HTML index file from the directory + with UCSC Cellbrowser html data. seurat_data_rds: type: File @@ -838,6 +895,14 @@ outputs: doc: | Reduced Seurat data in h5ad format + seurat_data_scope: + type: File? + outputBinding: + glob: "*_data.loom" + doc: | + Reduced Seurat data in SCope + compatible loom format + stdout_log: type: stdout @@ -900,21 +965,27 @@ s:creator: doc: | Single-cell Manual Cell Type Assignment - Assigns cell types for clusters based on the provided metadata file. + Assigns cell types for clusters based on + the provided metadata file. 
s:about: | - usage: sc_ctype_assign.R - [-h] --query QUERY --celltypes CELLTYPES --source SOURCE --target - TARGET [--diffgenes] [--diffpeaks] [--rnalogfc RNALOGFC] - [--rnaminpct RNAMINPCT] [--rnaonlypos] - [--rnatestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] - [--ataclogfc ATACLOGFC] [--atacminpct ATACMINPCT] - [--atactestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] - [--fragments FRAGMENTS] [--genes [GENES [GENES ...]]] [--pdf] - [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] - [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] - [--cpus CPUS] [--memory MEMORY] + usage: sc_ctype_assign.R [-h] --query QUERY --celltypes + CELLTYPES --source SOURCE --target + TARGET [--diffgenes] [--diffpeaks] + [--rnalogfc RNALOGFC] + [--rnaminpct RNAMINPCT] [--rnaonlypos] + [--rnatestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] + [--ataclogfc ATACLOGFC] + [--atacminpct ATACMINPCT] + [--atactestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] + [--fragments FRAGMENTS] + [--genes [GENES [GENES ...]]] [--pdf] + [--verbose] [--h5seurat] [--h5ad] + [--cbbuild] [--scope] + [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] Single-cell Manual Cell Type Assignment @@ -998,6 +1069,10 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save Seurat data to h5ad file. Default: false --cbbuild Export results to UCSC Cell Browser. Default: false + --scope Save Seurat data to SCope compatible loom file. Only + not normalized raw counts from the RNA assay will be + saved. If loaded Seurat object doesn't have RNA assay + this parameter will be ignored. Default: false --output OUTPUT Output prefix. Default: ./sc --theme {gray,bw,linedraw,light,dark,minimal,classic,void} Color theme for all generated plots. 
Default: classic diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index b1dfa427..8a9c9dc4 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -28,7 +28,6 @@ requirements: 'sd:upstream': sc_tools_sample: - - "sc-ctype-assign.cwl" - "sc-rna-cluster.cwl" - "sc-atac-cluster.cwl" - "sc-wnn-cluster.cwl" @@ -41,21 +40,36 @@ inputs: alias: type: string - label: "Experiment short name/alias" + label: "Analysis name" sd:preview: position: 1 query_data_rds: type: File - label: "Experiment run through any of the Single-cell Cluster Analysis" - doc: | - Path to the RDS file to load Seurat object from. This file should include - genes expression and/or chromatin accessibility information stored in the RNA - and ATAC assays correspondingly. Additionally, 'rnaumap', and/or 'atacumap', - and/or 'wnnumap' dimensionality reductions should be present. + label: "Single-cell Cluster Analysis" + doc: | + Analysis that includes clustered + single-cell data and was run through + at least one of the following workflows: + "Single-cell RNA-Seq Cluster Analysis", + "Single-cell ATAC-Seq Cluster Analysis", + "Single-cell WNN Cluster Analysis", - + at any of the processing stages. 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" 'sd:localLabel': true + atac_fragments_file: + type: File? + secondaryFiles: + - .tbi + label: "Cell Ranger ARC Sample (optional)" + doc: | + "Cell Ranger ARC Sample" for generating + fragments coverage plots over the genes + of interest. + 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" + 'sd:localLabel': true + query_reduction: type: - "null" @@ -65,124 +79,79 @@ inputs: - "ATAC" - "WNN" default: "RNA" - label: "Select clusters based on" + label: "Dimensionality reduction" doc: | - If set to 'RNA', then 'get_query_column' will have suffix 'rna_res'. - If set to 'ATAC', then 'get_query_column' will have suffix 'atac_res'. - If set to 'WNN', then 'get_query_column' will have suffix 'wsnn_res'. 
+ Dimensionality reduction for which + cluster names should be assigned. query_resolution: type: float - label: "Clustering resolution to assign cell types to" - doc: | - Clustering resolution defines 'query_source_column' and 'query_target_column' - inputs for 'assign_cell_types' step - - atac_fragments_file: - type: File? - secondaryFiles: - - .tbi - label: "Cell Ranger ARC Count/Aggregate Experiment for ATAC or WNN clusters" - doc: | - Count and barcode information for every ATAC fragment used in the loaded Seurat - object. File should be saved in TSV format with tbi-index file. Ignored if the - loaded Seurat object doesn't include ATAC assay. - 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" - 'sd:localLabel': true - - genes_of_interest: - type: string? - default: null - label: "Genes of interest to build gene expression and/or Tn5 insertion frequency plots for the nearest peaks" + label: "Clustering resolution" doc: | - Genes of interest to build gene expression and/or Tn5 insertion frequency plots - for the nearest peaks. To build gene expression plots the loaded Seurat object - should include RNA assay. To build Tn5 insertion frequency plots for the nearest - peaks the loaded Seurat object should include ATAC assay as well as the --fragments - file should be provided. - Default: None + Clustering resolution for the selected + "Dimensionality reduction" to be used + for cluster names assignment. cell_type_data: type: File - label: "TSV/CSV cell types metadata file with 'cluster' and 'type' columns" + label: "Cell types" doc: | - Path to the TSV/CSV file for manual cell type assignment for each of the clusters. - First column - 'cluster', second column may have arbitrary name. + A TSV/CSV file with the names for each + cluster defined by "Clustering resolution" + and "Dimensionality reduction" parameters. + The file should have two columns named + 'cluster' and 'celltype'. identify_diff_genes: type: boolean? 
- default: false - label: "Identify differentially expressed genes for assigned cell types" - doc: | - Identify differentially expressed genes (putative gene markers) for - assigned cell types. Ignored if loaded Seurat object doesn't include - genes expression information stored in the RNA assay. - Default: false - 'sd:layout': - advanced: true + default: true + label: "Find gene markers" + doc: | + Identify upregulated genes in each + cell type compared to all other cells. + Include only genes that are expressed + in at least 10% of the cells coming + from either current cell type or from + all other cell types together. + Exclude genes with log2FoldChange + values less than 0.25. Use Wilcoxon + Rank Sum test to calculate P-values. + Keep only genes with P-values lower + than 0.01. Adjust P-values for multiple + comparisons using Bonferroni correction. + Default: true identify_diff_peaks: type: boolean? default: false - label: "Identify differentially accessible peaks for assigned cell types" - doc: | - Identify differentially accessible peaks for assigned cell types. Ignored - if loaded Seurat object doesn't include chromatin accessibility information - stored in the ATAC assay. + label: "Find peak markers" + doc: | + Identify differentially accessible + peaks in each cell type compared to + all other cells. Include only peaks + that are present in at least 5% of + the cells coming from either current + cell type or from all other cell + types together. Exclude peaks with + log2FoldChange values less than 0.25. + Use logistic regression framework to + calculate P-values. Keep only peaks + with P-values lower than 0.01. Adjust + P-values for multiple comparisons + using Bonferroni correction. Default: false - 'sd:layout': - advanced: true - rna_minimum_logfc: - type: float? 
- default: 0.25 - label: "Include only those genes that on average have log fold change difference in expression between every tested pair of cell types not lower than this value" - doc: | - For putative gene markers identification include only those genes that - on average have log fold change difference in expression between every - tested pair of cell types not lower than this value. Ignored if '--diffgenes' - is not set or RNA assay is not present. - Default: 0.25 - 'sd:layout': - advanced: true - - rna_minimum_pct: - type: float? - default: 0.1 - label: "Include only those genes that are detected in not lower than this fraction of cells in either of the two tested cell types" - doc: | - For putative gene markers identification include only those genes that - are detected in not lower than this fraction of cells in either of the - two tested cell types. Ignored if '--diffgenes' is not set or RNA assay - is not present. - Default: 0.1 - 'sd:layout': - advanced: true - - atac_minimum_logfc: - type: float? - default: 0.25 - label: "Include only those peaks that on average have log fold change difference in the chromatin accessibility between every tested pair of cell types not lower than this value" - doc: | - For differentially accessible peaks identification include only those peaks that - on average have log fold change difference in the chromatin accessibility between - every tested pair of cell types not lower than this value. Ignored if '--diffpeaks' - is not set or ATAC assay is not present. - Default: 0.25 - 'sd:layout': - advanced: true - - atac_minimum_pct: - type: float? - default: 0.05 - label: "Include only those peaks that are detected in not lower than this fraction of cells in either of the two tested cell types" + genes_of_interest: + type: string? 
+ default: null + label: "Genes of interest" doc: | - For differentially accessible peaks identification include only those peaks that - are detected in not lower than this fraction of cells in either of the two tested - cell types. Ignored if '--diffpeaks' is not set or ATAC assay is not present. - Default: 0.05 - 'sd:layout': - advanced: true + Comma or space separated list of genes + of interest to visualize expression and + to generate fragments coverage plots. + Ignored if "Cell Ranger ARC Sample" input + is not provided. + Default: None color_theme: type: @@ -198,41 +167,12 @@ inputs: - "classic" - "void" default: "classic" - label: "Color theme for all generated plots" + label: "Plots color theme" doc: | - Color theme for all generated plots. One of gray, bw, linedraw, light, - dark, minimal, classic, void. + Color theme for all plots saved + as PNG files. Default: classic - 'sd:layout': - advanced: true - - parallel_memory_limit: - type: - - "null" - - type: enum - symbols: - - "32" - default: "32" - label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" - doc: | - Maximum memory in GB allowed to be shared between the workers - when using multiple --cpus. - Forced to 32 GB - 'sd:layout': - advanced: true - - vector_memory_limit: - type: - - "null" - - type: enum - symbols: - - "64" - default: "64" - label: "Maximum vector memory in GB allowed to be used by R" - doc: | - Maximum vector memory in GB allowed to be used by R. - Forced to 64 GB - 'sd:layout': + "sd:layout": advanced: true threads: @@ -241,12 +181,15 @@ inputs: - type: enum symbols: - "1" + - "2" default: "1" - label: "Number of cores/cpus to use" + label: "Cores/CPUs" doc: | - Number of cores/cpus to use - Forced to 1 - 'sd:layout': + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 1 + "sd:layout": advanced: true @@ -255,230 +198,231 @@ outputs: umap_rd_rnaumap_plot_png: type: File? 
outputSource: ctype_assign/umap_rd_rnaumap_plot_png - label: "Clustered cells RNA UMAP with assigned cell types" + label: "UMAP, colored by cell type, RNA" doc: | - Cells UMAP with assigned cell types (rnaumap dim. reduction). - PNG format + UMAP, colored by cell type, RNA 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Clustered cells RNA UMAP with assigned cell types' + tab: 'Per cell type' + Caption: 'UMAP, colored by cell type, RNA' umap_rd_atacumap_plot_png: type: File? outputSource: ctype_assign/umap_rd_atacumap_plot_png - label: "Clustered cells ATAC UMAP with assigned cell types" + label: "UMAP, colored by cell type, ATAC" doc: | - Cells UMAP with assigned cell types (atacumap dim. reduction). - PNG format + UMAP, colored by cell type, ATAC 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Clustered cells ATAC UMAP with assigned cell types' + tab: 'Per cell type' + Caption: 'UMAP, colored by cell type, ATAC' umap_rd_wnnumap_plot_png: type: File? outputSource: ctype_assign/umap_rd_wnnumap_plot_png - label: "Clustered cells WNN UMAP with assigned cell types" + label: "UMAP, colored by cell type, WNN" doc: | - Cells UMAP with assigned cell types (wnnumap dim. reduction). - PNG format + UMAP, colored by cell type, WNN 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Clustered cells WNN UMAP with assigned cell types' + tab: 'Per cell type' + Caption: 'UMAP, colored by cell type, WNN' - umap_spl_idnt_rd_rnaumap_plot_png: + umap_spl_ph_rd_rnaumap_plot_png: type: File? - outputSource: ctype_assign/umap_spl_idnt_rd_rnaumap_plot_png - label: "Split by dataset clustered cells RNA UMAP with assigned cell types" + outputSource: ctype_assign/umap_spl_ph_rd_rnaumap_plot_png + label: "UMAP, colored by cell type, split by cell cycle phase, RNA" doc: | - Split by dataset cells UMAP with assigned cell types (rnaumap dim. reduction). 
- PNG format + UMAP, colored by cell type, split + by cell cycle phase, RNA 'sd:visualPlugins': - image: - tab: 'Per dataset' - Caption: 'Split by dataset clustered cells RNA UMAP with assigned cell types' + tab: 'Per cell type' + Caption: 'UMAP, colored by cell type, split by cell cycle phase, RNA' - umap_spl_idnt_rd_atacumap_plot_png: + umap_spl_ph_rd_atacumap_plot_png: type: File? - outputSource: ctype_assign/umap_spl_idnt_rd_atacumap_plot_png - label: "Split by dataset clustered cells ATAC UMAP with assigned cell types" + outputSource: ctype_assign/umap_spl_ph_rd_atacumap_plot_png + label: "UMAP, colored by cell type, split by cell cycle phase, ATAC" doc: | - Split by dataset cells UMAP with assigned cell types (atacumap dim. reduction). - PNG format + UMAP, colored by cell type, split + by cell cycle phase, ATAC 'sd:visualPlugins': - image: - tab: 'Per dataset' - Caption: 'Split by dataset clustered cells ATAC UMAP with assigned cell types' + tab: 'Per cell type' + Caption: 'UMAP, colored by cell type, split by cell cycle phase, ATAC' - umap_spl_idnt_rd_wnnumap_plot_png: + umap_spl_ph_rd_wnnumap_plot_png: type: File? - outputSource: ctype_assign/umap_spl_idnt_rd_wnnumap_plot_png - label: "Split by dataset clustered cells WNN UMAP with assigned cell types" + outputSource: ctype_assign/umap_spl_ph_rd_wnnumap_plot_png + label: "UMAP, colored by cell type, split by cell cycle phase, WNN" doc: | - Split by dataset cells UMAP with assigned cell types (wnnumap dim. reduction). - PNG format + UMAP, colored by cell type, split + by cell cycle phase, WNN 'sd:visualPlugins': - image: - tab: 'Per dataset' - Caption: 'Split by dataset clustered cells WNN UMAP with assigned cell types' + tab: 'Per cell type' + Caption: 'UMAP, colored by cell type, split by cell cycle phase, WNN' - umap_spl_cnd_rd_rnaumap_plot_png: + cmp_gr_ph_spl_ctyp_plot_png: type: File? 
- outputSource: ctype_assign/umap_spl_cnd_rd_rnaumap_plot_png - label: "Split by grouping condition clustered cells RNA UMAP with assigned cell types" + outputSource: ctype_assign/cmp_gr_ph_spl_ctyp_plot_png + label: "Composition plot, colored by cell cycle phase, split by cell type, downsampled" doc: | - Split by grouping condition cells UMAP with assigned cell types (rnaumap dim. reduction). - PNG format + Composition plot, colored by cell + cycle phase, split by cell type, + downsampled 'sd:visualPlugins': - image: - tab: 'Per group' - Caption: 'Split by grouping condition clustered cells RNA UMAP with assigned cell types' + tab: 'Per dataset' + Caption: 'Composition plot, colored by cell cycle phase, split by cell type, downsampled' - umap_spl_cnd_rd_atacumap_plot_png: + umap_spl_idnt_rd_rnaumap_plot_png: type: File? - outputSource: ctype_assign/umap_spl_cnd_rd_atacumap_plot_png - label: "Split by grouping condition clustered cells ATAC UMAP with assigned cell types" + outputSource: ctype_assign/umap_spl_idnt_rd_rnaumap_plot_png + label: "UMAP, colored by cell type, split by dataset, RNA" doc: | - Split by grouping condition cells UMAP with assigned cell types (atacumap dim. reduction). - PNG format + UMAP, colored by cell type, + split by dataset, RNA 'sd:visualPlugins': - image: - tab: 'Per group' - Caption: 'Split by grouping condition clustered cells ATAC UMAP with assigned cell types' + tab: 'Per dataset' + Caption: 'UMAP, colored by cell type, split by dataset, RNA' - umap_spl_cnd_rd_wnnumap_plot_png: + umap_spl_idnt_rd_atacumap_plot_png: type: File? - outputSource: ctype_assign/umap_spl_cnd_rd_wnnumap_plot_png - label: "Split by grouping condition clustered cells WNN UMAP with assigned cell types" + outputSource: ctype_assign/umap_spl_idnt_rd_atacumap_plot_png + label: "UMAP, colored by cell type, split by dataset, ATAC" doc: | - Split by grouping condition cells UMAP with assigned cell types (wnnumap dim. reduction). 
- PNG format + UMAP, colored by cell type, + split by dataset, ATAC 'sd:visualPlugins': - image: - tab: 'Per group' - Caption: 'Split by grouping condition clustered cells WNN UMAP with assigned cell types' + tab: 'Per dataset' + Caption: 'UMAP, colored by cell type, split by dataset, ATAC' - umap_spl_ph_rd_rnaumap_plot_png: + umap_spl_idnt_rd_wnnumap_plot_png: type: File? - outputSource: ctype_assign/umap_spl_ph_rd_rnaumap_plot_png - label: "Split by cell cycle phase cells RNA UMAP with assigned cell types" + outputSource: ctype_assign/umap_spl_idnt_rd_wnnumap_plot_png + label: "UMAP, colored by cell type, split by dataset, WNN" doc: | - Split by cell cycle phase cells UMAP with assigned cell types (rnaumap dim. reduction). - PNG format + UMAP, colored by cell type, + split by dataset, WNN 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by cell cycle phase cells RNA UMAP with assigned cell types' + Caption: 'UMAP, colored by cell type, split by dataset, WNN' - umap_spl_ph_rd_atacumap_plot_png: + cmp_gr_ctyp_spl_idnt_plot_png: type: File? - outputSource: ctype_assign/umap_spl_ph_rd_atacumap_plot_png - label: "Split by cell cycle phase cells ATAC UMAP with assigned cell types" + outputSource: ctype_assign/cmp_gr_ctyp_spl_idnt_plot_png + label: "Composition plot, colored by cell type, split by dataset, downsampled" doc: | - Split by cell cycle phase cells UMAP with assigned cell types (atacumap dim. reduction). - PNG format + Composition plot, colored by cell + type, split by dataset, downsampled 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by cell cycle phase cells ATAC UMAP with assigned cell types' + Caption: 'Composition plot, colored by cell type, split by dataset, downsampled' - umap_spl_ph_rd_wnnumap_plot_png: + cmp_gr_idnt_spl_ctyp_plot_png: type: File? 
- outputSource: ctype_assign/umap_spl_ph_rd_wnnumap_plot_png - label: "Split by cell cycle phase cells WNN UMAP with assigned cell types" + outputSource: ctype_assign/cmp_gr_idnt_spl_ctyp_plot_png + label: "Composition plot, colored by dataset, split by cell type, downsampled" doc: | - Split by cell cycle phase cells UMAP with assigned cell types (wnnumap dim. reduction). - PNG format + Composition plot, colored by + dataset, split by cell type, + downsampled 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'Split by cell cycle phase cells WNN UMAP with assigned cell types' + Caption: 'Composition plot, colored by dataset, split by cell type, downsampled' - cmp_gr_ctyp_spl_idnt_plot_png: + cmp_gr_ph_spl_idnt_plot_png: type: File? - outputSource: ctype_assign/cmp_gr_ctyp_spl_idnt_plot_png - label: "Grouped by cell type split by dataset cells composition plot. Downsampled." + outputSource: ctype_assign/cmp_gr_ph_spl_idnt_plot_png + label: "Composition plot, colored by cell cycle phase, split by dataset, downsampled" doc: | - Grouped by cell type split by dataset cells composition plot. Downsampled. - PNG format + Composition plot, colored by + cell cycle phase, split by + dataset, downsampled 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Grouped by cell type split by dataset cells composition plot. Downsampled.' + tab: 'Per dataset' + Caption: 'Composition plot, colored by cell cycle phase, split by dataset, downsampled' - cmp_gr_idnt_spl_ctyp_plot_png: + umap_spl_cnd_rd_rnaumap_plot_png: type: File? - outputSource: ctype_assign/cmp_gr_idnt_spl_ctyp_plot_png - label: "Grouped by dataset split by cell type cells composition plot. Downsampled." + outputSource: ctype_assign/umap_spl_cnd_rd_rnaumap_plot_png + label: "UMAP, colored by cell type, split by grouping condition, RNA" doc: | - Grouped by dataset split by cell type cells composition plot. Downsampled. 
- PNG format + UMAP, colored by cell type, split + by grouping condition, RNA 'sd:visualPlugins': - image: - tab: 'Per dataset' - Caption: 'Grouped by dataset split by cell type cells composition plot. Downsampled.' + tab: 'Per group' + Caption: 'UMAP, colored by cell type, split by grouping condition, RNA' - cmp_gr_ph_spl_idnt_plot_png: + umap_spl_cnd_rd_atacumap_plot_png: type: File? - outputSource: ctype_assign/cmp_gr_ph_spl_idnt_plot_png - label: "Grouped by cell cycle phase split by dataset cells composition plot. Downsampled." + outputSource: ctype_assign/umap_spl_cnd_rd_atacumap_plot_png + label: "UMAP, colored by cell type, split by grouping condition, ATAC" doc: | - Grouped by cell cycle phase split by dataset cells composition plot. Downsampled. - PNG format + UMAP, colored by cell type, split + by grouping condition, ATAC 'sd:visualPlugins': - image: - tab: 'Per dataset' - Caption: 'Grouped by cell cycle phase split by dataset cells composition plot. Downsampled.' + tab: 'Per group' + Caption: 'UMAP, colored by cell type, split by grouping condition, ATAC' - cmp_gr_ctyp_spl_cnd_plot_png: + umap_spl_cnd_rd_wnnumap_plot_png: type: File? - outputSource: ctype_assign/cmp_gr_ctyp_spl_cnd_plot_png - label: "Grouped by cell type split by condition cells composition plot. Downsampled." + outputSource: ctype_assign/umap_spl_cnd_rd_wnnumap_plot_png + label: "UMAP, colored by cell type, split by grouping condition, WNN" doc: | - Grouped by cell type split by condition cells composition plot. Downsampled. - PNG format + UMAP, colored by cell type, split + by grouping condition, WNN 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Grouped by cell type split by condition cells composition plot. Downsampled.' + Caption: 'UMAP, colored by cell type, split by grouping condition, WNN' - cmp_gr_cnd_spl_ctyp_plot_png: + cmp_gr_ctyp_spl_cnd_plot_png: type: File? 
- outputSource: ctype_assign/cmp_gr_cnd_spl_ctyp_plot_png - label: "Grouped by condition split by cell type cells composition plot. Downsampled." + outputSource: ctype_assign/cmp_gr_ctyp_spl_cnd_plot_png + label: "Composition plot, colored by cell type, split by grouping condition, downsampled" doc: | - Grouped by condition split by cell type cells composition plot. Downsampled. - PNG format + Composition plot, colored by cell + type, split by grouping condition, + downsampled 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'Grouped by condition split by cell type cells composition plot. Downsampled.' + Caption: 'Composition plot, colored by cell type, split by grouping condition, downsampled' - cmp_gr_ph_spl_ctyp_plot_png: + cmp_gr_cnd_spl_ctyp_plot_png: type: File? - outputSource: ctype_assign/cmp_gr_ph_spl_ctyp_plot_png - label: "Grouped by cell cycle phase split by cell type cells composition plot. Downsampled." + outputSource: ctype_assign/cmp_gr_cnd_spl_ctyp_plot_png + label: "Composition plot, colored by grouping condition, split by cell type, downsampled" doc: | - Grouped by cell cycle phase split by cell type cells composition plot. Downsampled. - PNG format + Composition plot, colored by + grouping condition, split by + cell type, downsampled 'sd:visualPlugins': - image: - tab: 'Per dataset' - Caption: 'Grouped by cell cycle phase split by cell type cells composition plot. Downsampled.' + tab: 'Per group' + Caption: 'Composition plot, colored by grouping condition, split by cell type, downsampled' xpr_avg_plot_png: type: File? outputSource: ctype_assign/xpr_avg_plot_png - label: "Log normalized scaled average gene expression per cell type" + label: "Gene expression dot plot" doc: | - Log normalized scaled average gene expression per cell type. 
- PNG format + Gene expression dot plot 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized scaled average gene expression per cell type' + tab: 'Genes of interest' + Caption: 'Gene expression dot plot' xpr_dnst_plot_png: type: @@ -486,14 +430,13 @@ outputs: - type: array items: File outputSource: ctype_assign/xpr_dnst_plot_png - label: "Log normalized gene expression density per cell type" + label: "Gene expression violin plot" doc: | - Log normalized gene expression density per cell type. - PNG format + Gene expression violin plot 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression density per cell type' + tab: 'Genes of interest' + Caption: 'Gene expression violin plot' xpr_per_cell_rd_rnaumap_plot_png: type: @@ -501,14 +444,13 @@ outputs: - type: array items: File outputSource: ctype_assign/xpr_per_cell_rd_rnaumap_plot_png - label: "Log normalized gene expression on cells RNA UMAP with assigned cell types" + label: "UMAP, gene expression, RNA" doc: | - Log normalized gene expression on cells UMAP with assigned cell types (rnaumap dim. reduction). - PNG format + UMAP, gene expression, RNA 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression on cells RNA UMAP with assigned cell types' + tab: 'Genes of interest' + Caption: 'UMAP, gene expression, RNA' xpr_per_cell_rd_atacumap_plot_png: type: @@ -516,14 +458,13 @@ outputs: - type: array items: File outputSource: ctype_assign/xpr_per_cell_rd_atacumap_plot_png - label: "Log normalized gene expression on cells ATAC UMAP with assigned cell types" + label: "UMAP, gene expression, ATAC" doc: | - Log normalized gene expression on cells UMAP with assigned cell types (atacumap dim. reduction). 
- PNG format + UMAP, gene expression, ATAC 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression on cells ATAC UMAP with assigned cell types' + tab: 'Genes of interest' + Caption: 'UMAP, gene expression, ATAC' xpr_per_cell_rd_wnnumap_plot_png: type: @@ -531,14 +472,13 @@ outputs: - type: array items: File outputSource: ctype_assign/xpr_per_cell_rd_wnnumap_plot_png - label: "Log normalized gene expression on cells WNN UMAP with assigned cell types" + label: "UMAP, gene expression, WNN" doc: | - Log normalized gene expression on cells UMAP with assigned cell types (wnnumap dim. reduction). - PNG format + UMAP, gene expression, WNN 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression on cells WNN UMAP with assigned cell types' + tab: 'Genes of interest' + Caption: 'UMAP, gene expression, WNN' xpr_per_cell_sgnl_rd_rnaumap_plot_png: type: @@ -546,14 +486,13 @@ outputs: - type: array items: File outputSource: ctype_assign/xpr_per_cell_sgnl_rd_rnaumap_plot_png - label: "Log normalized gene expression density on cells RNA UMAP with assigned cell types" + label: "UMAP, gene expression density, RNA" doc: | - Log normalized gene expression density on cells UMAP with assigned cell types (rnaumap dim. reduction). - PNG format + UMAP, gene expression density, RNA 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression density on cells RNA UMAP with assigned cell types' + tab: 'Genes of interest' + Caption: 'UMAP, gene expression density, RNA' xpr_per_cell_sgnl_rd_atacumap_plot_png: type: @@ -561,14 +500,13 @@ outputs: - type: array items: File outputSource: ctype_assign/xpr_per_cell_sgnl_rd_atacumap_plot_png - label: "Log normalized gene expression density on cells ATAC UMAP with assigned cell types" + label: "UMAP, gene expression density, ATAC" doc: | - Log normalized gene expression density on cells UMAP with assigned cell types (atacumap dim. 
reduction). - PNG format + UMAP, gene expression density, ATAC 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression density on cells ATAC UMAP with assigned cell types' + tab: 'Genes of interest' + Caption: 'UMAP, gene expression density, ATAC' xpr_per_cell_sgnl_rd_wnnumap_plot_png: type: @@ -576,14 +514,24 @@ outputs: - type: array items: File outputSource: ctype_assign/xpr_per_cell_sgnl_rd_wnnumap_plot_png - label: "Log normalized gene expression density on cells WNN UMAP with assigned cell types" + label: "UMAP, gene expression density, WNN" + doc: | + UMAP, gene expression density, WNN + 'sd:visualPlugins': + - image: + tab: 'Genes of interest' + Caption: 'UMAP, gene expression density, WNN' + + xpr_htmp_plot_png: + type: File? + outputSource: ctype_assign/xpr_htmp_plot_png + label: "Gene expression heatmap" doc: | - Log normalized gene expression density on cells UMAP with assigned cell types (wnnumap dim. reduction). - PNG format + Gene expression heatmap 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression density on cells WNN UMAP with assigned cell types' + tab: 'Heatmap' + Caption: 'Gene expression heatmap' cvrg_plot_png: type: @@ -591,67 +539,53 @@ outputs: - type: array items: File outputSource: ctype_assign/cvrg_plot_png - label: "Tn5 insertion frequency plot around gene" + label: "Fragments coverage" doc: | - Tn5 insertion frequency plot around gene. - PNG format + Fragments coverage 'sd:visualPlugins': - image: tab: 'Genome coverage' - Caption: 'Tn5 insertion frequency plot around gene' - - xpr_htmp_plot_png: - type: File? - outputSource: ctype_assign/xpr_htmp_plot_png - label: "Normalized gene expression heatmap grouped by cell type" - doc: | - Normalized gene expression heatmap grouped by cell type. 
- PNG format - 'sd:visualPlugins': - - image: - tab: 'Gene expression' - Caption: 'Normalized gene expression heatmap grouped by cell type' + Caption: 'Fragments coverage' gene_markers_tsv: type: File? outputSource: ctype_assign/gene_markers_tsv - label: "Differentially expressed genes between each pair of cell types" + label: "Gene markers per cell type" doc: | - Differentially expressed genes between each pair of cell types. - TSV format + Gene markers per cell type 'sd:visualPlugins': - syncfusiongrid: tab: 'Gene markers' - Title: 'Differentially expressed genes between each pair of cell types' + Title: 'Gene markers per cell type' peak_markers_tsv: type: File? outputSource: ctype_assign/peak_markers_tsv - label: "Differentially accessible peaks between each pair of cell types" + label: "Peak markers per cell type" doc: | - Differentially accessible peaks between each pair of cell types. - TSV format + Peak markers per cell type 'sd:visualPlugins': - syncfusiongrid: - tab: 'Diff. peaks' - Title: 'Differentially accessible peaks between each pair of cell types' + tab: 'Peak markers' + Title: 'Peak markers per cell type' ucsc_cb_html_data: - type: Directory + type: Directory? outputSource: ctype_assign/ucsc_cb_html_data - label: "Directory with UCSC Cellbrowser html data" + label: "UCSC Cell Browser data" doc: | - Directory with UCSC Cellbrowser html data. + Directory with UCSC Cell Browser + data ucsc_cb_html_file: - type: File + type: File? outputSource: ctype_assign/ucsc_cb_html_file - label: "Open in UCSC Cell Browser" + label: "UCSC Cell Browser" doc: | - HTML index file from the directory with UCSC Cellbrowser html data. - 'sd:visualPlugins': + UCSC Cell Browser HTML index file + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" seurat_data_rds: @@ -661,6 +595,13 @@ outputs: doc: | Processed Seurat data in RDS format + seurat_data_scope: + type: File? 
+ outputSource: ctype_assign/seurat_data_scope + label: "Processed Seurat data in SCope compatible loom format" + doc: | + Processed Seurat data in SCope compatible loom format + ctype_assign_stdout_log: type: File outputSource: ctype_assign/stdout_log @@ -695,10 +636,14 @@ steps: valueFrom: $(split_features(self)) identify_diff_genes: identify_diff_genes identify_diff_peaks: identify_diff_peaks - rna_minimum_logfc: rna_minimum_logfc - rna_minimum_pct: rna_minimum_pct - atac_minimum_logfc: atac_minimum_logfc - atac_minimum_pct: atac_minimum_pct + rna_minimum_logfc: + default: 0.25 + rna_minimum_pct: + default: 0.1 + atac_minimum_logfc: + default: 0.25 + atac_minimum_pct: + default: 0.05 only_positive_diff_genes: default: true rna_test_to_use: @@ -709,13 +654,13 @@ steps: default: true export_ucsc_cb: default: true + export_scope_data: + default: true color_theme: color_theme parallel_memory_limit: - source: parallel_memory_limit - valueFrom: $(parseInt(self)) + default: 32 vector_memory_limit: - source: vector_memory_limit - valueFrom: $(parseInt(self)) + default: 96 threads: source: threads valueFrom: $(parseInt(self)) @@ -753,6 +698,7 @@ steps: - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds + - seurat_data_scope - stdout_log - stderr_log @@ -805,4 +751,5 @@ s:creator: doc: | Single-cell Manual Cell Type Assignment - Assigns cell types for clusters based on the provided metadata file. \ No newline at end of file + Assigns cell types for clusters based on + the provided metadata file. 
\ No newline at end of file From b24eef3d30d66f1716be5698de744f574ee22caf Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 17 Jul 2023 13:18:58 -0400 Subject: [PATCH 057/162] Not important changes --- workflows/sc-rna-reduce.cwl | 42 +++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 92df432b..32a3c038 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -166,6 +166,25 @@ inputs: cycle gene set is not provided. Default: "do not remove" + regress_genes: + type: string? + label: "Regress genes" + default: null + doc: | + Regress expression of the selected genes + as a confounding source of variation. + Default: None + + regress_mito_perc: + type: boolean? + label: "Regress mitochondrial percentage" + default: false + doc: | + Regress the percentage of transcripts + mapped to mitochondrial genes as a + confounding source of variation. + Default: false + datasets_metadata: type: File? label: "Datasets metadata (optional)" @@ -226,29 +245,6 @@ inputs: 'sd:layout': advanced: true - regress_mito_perc: - type: boolean? - label: "Regress mitochondrial percentage" - default: false - doc: | - Regress the percentage of transcripts - mapped to mitochondrial genes as a - confounding source of variation. - Default: false - 'sd:layout': - advanced: true - - regress_genes: - type: string? - label: "Regress genes" - default: null - doc: | - Regress expression of the selected genes - as a confounding source of variation. - Default: None - 'sd:layout': - advanced: true - export_ucsc_cb: type: boolean? 
default: false From 343df7bdab9b2abc42703b51e9d4b7c7670dbe14 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 18 Jul 2023 16:30:23 -0400 Subject: [PATCH 058/162] Not important changes --- workflows/sc-atac-reduce.cwl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index 36755889..8cd643a8 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -212,6 +212,8 @@ inputs: symbols: - "1" - "2" + - "3" + - "4" default: "1" label: "Cores/CPUs" doc: | From e796111d91bff372ec416e52ce441c8101d6f79c Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 19 Jul 2023 12:21:30 -0400 Subject: [PATCH 059/162] Export PDFs to a compressed folder --- workflows/sc-atac-cluster.cwl | 44 +++++++++++ workflows/sc-atac-reduce.cwl | 60 ++++++++++++++ workflows/sc-ctype-assign.cwl | 82 +++++++++++++++++++ workflows/sc-multiome-filter.cwl | 130 +++++++++++++++++++++++++++++++ workflows/sc-rna-cluster.cwl | 58 ++++++++++++++ workflows/sc-rna-reduce.cwl | 60 ++++++++++++++ workflows/sc-wnn-cluster.cwl | 58 ++++++++++++++ 7 files changed, 492 insertions(+) diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index a3f24fda..e5b9a42a 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -335,6 +335,14 @@ outputs: doc: | Processed Seurat data in RDS format + pdf_plots: + type: File + outputSource: compress_pdf_plots/compressed_folder + label: "Plots in PDF format" + doc: | + Compressed folder with plots + in PDF format + sc_atac_cluster_stdout_log: type: File outputSource: sc_atac_cluster/stdout_log @@ -380,6 +388,8 @@ steps: default: true export_ucsc_cb: default: true + export_pdf_plots: + default: true color_theme: color_theme parallel_memory_limit: default: 32 @@ -398,6 +408,15 @@ steps: - cmp_gr_clst_spl_cnd_res_plot_png - cmp_gr_cnd_spl_clst_res_plot_png - cvrg_res_plot_png + - umap_res_plot_pdf + - slh_res_plot_pdf + - 
umap_spl_idnt_res_plot_pdf + - cmp_gr_clst_spl_idnt_res_plot_pdf + - cmp_gr_idnt_spl_clst_res_plot_pdf + - umap_spl_cnd_res_plot_pdf + - cmp_gr_clst_spl_cnd_res_plot_pdf + - cmp_gr_cnd_spl_clst_res_plot_pdf + - cvrg_res_plot_pdf - peak_markers_tsv - ucsc_cb_html_data - ucsc_cb_html_file @@ -405,6 +424,31 @@ steps: - stdout_log - stderr_log + pdf_plots: + run: ../tools/files-to-folder.cwl + in: + input_files: + source: + - sc_atac_cluster/umap_res_plot_pdf + - sc_atac_cluster/slh_res_plot_pdf + - sc_atac_cluster/umap_spl_idnt_res_plot_pdf + - sc_atac_cluster/cmp_gr_clst_spl_idnt_res_plot_pdf + - sc_atac_cluster/cmp_gr_idnt_spl_clst_res_plot_pdf + - sc_atac_cluster/umap_spl_cnd_res_plot_pdf + - sc_atac_cluster/cmp_gr_clst_spl_cnd_res_plot_pdf + - sc_atac_cluster/cmp_gr_cnd_spl_clst_res_plot_pdf + - sc_atac_cluster/cvrg_res_plot_pdf + valueFrom: $(self.flat()) + out: + - folder + + compress_pdf_plots: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: pdf_plots/folder + out: + - compressed_folder + $namespaces: s: http://schema.org/ diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index 8cd643a8..943fb06b 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -454,6 +454,14 @@ outputs: doc: | Processed Seurat data in RDS format + pdf_plots: + type: File + outputSource: compress_pdf_plots/compressed_folder + label: "Plots in PDF format" + doc: | + Compressed folder with plots + in PDF format + sc_atac_reduce_stdout_log: type: File outputSource: sc_atac_reduce/stdout_log @@ -501,6 +509,8 @@ steps: verbose: default: true export_ucsc_cb: export_ucsc_cb + export_pdf_plots: + default: true color_theme: color_theme parallel_memory_limit: default: 32 @@ -527,12 +537,62 @@ steps: - umap_gr_cnd_spl_ncls_plot_png - umap_gr_cnd_spl_frip_plot_png - umap_gr_cnd_spl_blck_plot_png + - qc_dim_corr_plot_pdf + - umap_qc_mtrcs_plot_pdf + - umap_plot_pdf + - umap_spl_idnt_plot_pdf + - umap_spl_cnd_plot_pdf + - 
umap_spl_frgm_plot_pdf + - umap_spl_peak_plot_pdf + - umap_spl_tss_plot_pdf + - umap_spl_ncls_plot_pdf + - umap_spl_frip_plot_pdf + - umap_spl_blck_plot_pdf + - umap_gr_cnd_spl_frgm_plot_pdf + - umap_gr_cnd_spl_peak_plot_pdf + - umap_gr_cnd_spl_tss_plot_pdf + - umap_gr_cnd_spl_ncls_plot_pdf + - umap_gr_cnd_spl_frip_plot_pdf + - umap_gr_cnd_spl_blck_plot_pdf - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds - stdout_log - stderr_log + pdf_plots: + run: ../tools/files-to-folder.cwl + in: + input_files: + source: + - sc_atac_reduce/qc_dim_corr_plot_pdf + - sc_atac_reduce/umap_qc_mtrcs_plot_pdf + - sc_atac_reduce/umap_plot_pdf + - sc_atac_reduce/umap_spl_idnt_plot_pdf + - sc_atac_reduce/umap_spl_cnd_plot_pdf + - sc_atac_reduce/umap_spl_frgm_plot_pdf + - sc_atac_reduce/umap_spl_peak_plot_pdf + - sc_atac_reduce/umap_spl_tss_plot_pdf + - sc_atac_reduce/umap_spl_ncls_plot_pdf + - sc_atac_reduce/umap_spl_frip_plot_pdf + - sc_atac_reduce/umap_spl_blck_plot_pdf + - sc_atac_reduce/umap_gr_cnd_spl_frgm_plot_pdf + - sc_atac_reduce/umap_gr_cnd_spl_peak_plot_pdf + - sc_atac_reduce/umap_gr_cnd_spl_tss_plot_pdf + - sc_atac_reduce/umap_gr_cnd_spl_ncls_plot_pdf + - sc_atac_reduce/umap_gr_cnd_spl_frip_plot_pdf + - sc_atac_reduce/umap_gr_cnd_spl_blck_plot_pdf + valueFrom: $(self.flat()) + out: + - folder + + compress_pdf_plots: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: pdf_plots/folder + out: + - compressed_folder + $namespaces: s: http://schema.org/ diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 8a9c9dc4..10496f24 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -602,6 +602,14 @@ outputs: doc: | Processed Seurat data in SCope compatible loom format + pdf_plots: + type: File + outputSource: compress_pdf_plots/compressed_folder + label: "Plots in PDF format" + doc: | + Compressed folder with plots + in PDF format + ctype_assign_stdout_log: type: File outputSource: ctype_assign/stdout_log @@ 
-656,6 +664,8 @@ steps: default: true export_scope_data: default: true + export_pdf_plots: + default: true color_theme: color_theme parallel_memory_limit: default: 32 @@ -693,6 +703,34 @@ steps: - xpr_per_cell_sgnl_rd_wnnumap_plot_png - cvrg_plot_png - xpr_htmp_plot_png + - umap_rd_rnaumap_plot_pdf + - umap_rd_atacumap_plot_pdf + - umap_rd_wnnumap_plot_pdf + - umap_spl_idnt_rd_rnaumap_plot_pdf + - umap_spl_idnt_rd_atacumap_plot_pdf + - umap_spl_idnt_rd_wnnumap_plot_pdf + - umap_spl_cnd_rd_rnaumap_plot_pdf + - umap_spl_cnd_rd_atacumap_plot_pdf + - umap_spl_cnd_rd_wnnumap_plot_pdf + - umap_spl_ph_rd_rnaumap_plot_pdf + - umap_spl_ph_rd_atacumap_plot_pdf + - umap_spl_ph_rd_wnnumap_plot_pdf + - cmp_gr_ctyp_spl_idnt_plot_pdf + - cmp_gr_idnt_spl_ctyp_plot_pdf + - cmp_gr_ph_spl_idnt_plot_pdf + - cmp_gr_ctyp_spl_cnd_plot_pdf + - cmp_gr_cnd_spl_ctyp_plot_pdf + - cmp_gr_ph_spl_ctyp_plot_pdf + - xpr_avg_plot_pdf + - xpr_dnst_plot_pdf + - xpr_per_cell_rd_rnaumap_plot_pdf + - xpr_per_cell_rd_atacumap_plot_pdf + - xpr_per_cell_rd_wnnumap_plot_pdf + - xpr_per_cell_sgnl_rd_rnaumap_plot_pdf + - xpr_per_cell_sgnl_rd_atacumap_plot_pdf + - xpr_per_cell_sgnl_rd_wnnumap_plot_pdf + - cvrg_plot_pdf + - xpr_htmp_plot_pdf - gene_markers_tsv - peak_markers_tsv - ucsc_cb_html_data @@ -702,6 +740,50 @@ steps: - stdout_log - stderr_log + pdf_plots: + run: ../tools/files-to-folder.cwl + in: + input_files: + source: + - ctype_assign/umap_rd_rnaumap_plot_pdf + - ctype_assign/umap_rd_atacumap_plot_pdf + - ctype_assign/umap_rd_wnnumap_plot_pdf + - ctype_assign/umap_spl_idnt_rd_rnaumap_plot_pdf + - ctype_assign/umap_spl_idnt_rd_atacumap_plot_pdf + - ctype_assign/umap_spl_idnt_rd_wnnumap_plot_pdf + - ctype_assign/umap_spl_cnd_rd_rnaumap_plot_pdf + - ctype_assign/umap_spl_cnd_rd_atacumap_plot_pdf + - ctype_assign/umap_spl_cnd_rd_wnnumap_plot_pdf + - ctype_assign/umap_spl_ph_rd_rnaumap_plot_pdf + - ctype_assign/umap_spl_ph_rd_atacumap_plot_pdf + - ctype_assign/umap_spl_ph_rd_wnnumap_plot_pdf + - 
ctype_assign/cmp_gr_ctyp_spl_idnt_plot_pdf + - ctype_assign/cmp_gr_idnt_spl_ctyp_plot_pdf + - ctype_assign/cmp_gr_ph_spl_idnt_plot_pdf + - ctype_assign/cmp_gr_ctyp_spl_cnd_plot_pdf + - ctype_assign/cmp_gr_cnd_spl_ctyp_plot_pdf + - ctype_assign/cmp_gr_ph_spl_ctyp_plot_pdf + - ctype_assign/xpr_avg_plot_pdf + - ctype_assign/xpr_dnst_plot_pdf + - ctype_assign/xpr_per_cell_rd_rnaumap_plot_pdf + - ctype_assign/xpr_per_cell_rd_atacumap_plot_pdf + - ctype_assign/xpr_per_cell_rd_wnnumap_plot_pdf + - ctype_assign/xpr_per_cell_sgnl_rd_rnaumap_plot_pdf + - ctype_assign/xpr_per_cell_sgnl_rd_atacumap_plot_pdf + - ctype_assign/xpr_per_cell_sgnl_rd_wnnumap_plot_pdf + - ctype_assign/cvrg_plot_pdf + - ctype_assign/xpr_htmp_plot_pdf + valueFrom: $(self.flat()) + out: + - folder + + compress_pdf_plots: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: pdf_plots/folder + out: + - compressed_folder + $namespaces: s: http://schema.org/ diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 34dc2091..925c7116 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -1097,6 +1097,14 @@ outputs: Example of datasets metadata file in TSV format + pdf_plots: + type: File + outputSource: compress_pdf_plots/compressed_folder + label: "Plots in PDF format" + doc: | + Compressed folder with plots + in PDF format + sc_multiome_filter_stdout_log: type: File outputSource: sc_multiome_filter/stdout_log @@ -1190,6 +1198,8 @@ steps: default: true export_ucsc_cb: default: true + export_pdf_plots: + default: true color_theme: color_theme parallel_memory_limit: default: 32 @@ -1251,6 +1261,58 @@ steps: - fltr_frgm_dnst_spl_cnd_plot_png - fltr_peak_dnst_spl_cnd_plot_png - fltr_blck_dnst_spl_cnd_plot_png + - raw_1_2_qc_mtrcs_pca_plot_pdf + - raw_2_3_qc_mtrcs_pca_plot_pdf + - raw_cells_count_plot_pdf + - raw_umi_dnst_plot_pdf + - raw_gene_dnst_plot_pdf + - raw_gene_umi_plot_pdf + - raw_mito_dnst_plot_pdf + - raw_nvlt_dnst_plot_pdf + - 
raw_frgm_dnst_plot_pdf + - raw_peak_dnst_plot_pdf + - raw_blck_dnst_plot_pdf + - raw_rna_atac_cnts_plot_pdf + - raw_tss_frgm_plot_pdf + - raw_qc_mtrcs_dnst_plot_pdf + - raw_rnadbl_plot_pdf + - raw_atacdbl_plot_pdf + - raw_vrlpdbl_plot_pdf + - raw_tss_nrch_plot_pdf + - raw_frgm_hist_pdf + - raw_umi_dnst_spl_cnd_plot_pdf + - raw_gene_dnst_spl_cnd_plot_pdf + - raw_mito_dnst_spl_cnd_plot_pdf + - raw_nvlt_dnst_spl_cnd_plot_pdf + - raw_frgm_dnst_spl_cnd_plot_pdf + - raw_peak_dnst_spl_cnd_plot_pdf + - raw_blck_dnst_spl_cnd_plot_pdf + - fltr_1_2_qc_mtrcs_pca_plot_pdf + - fltr_2_3_qc_mtrcs_pca_plot_pdf + - fltr_cells_count_plot_pdf + - fltr_umi_dnst_plot_pdf + - fltr_gene_dnst_plot_pdf + - fltr_gene_umi_plot_pdf + - fltr_mito_dnst_plot_pdf + - fltr_nvlt_dnst_plot_pdf + - fltr_frgm_dnst_plot_pdf + - fltr_peak_dnst_plot_pdf + - fltr_blck_dnst_plot_pdf + - fltr_rna_atac_cnts_plot_pdf + - fltr_rnadbl_plot_pdf + - fltr_atacdbl_plot_pdf + - fltr_vrlpdbl_plot_pdf + - fltr_tss_frgm_plot_pdf + - fltr_qc_mtrcs_dnst_plot_pdf + - fltr_tss_nrch_plot_pdf + - fltr_frgm_hist_pdf + - fltr_umi_dnst_spl_cnd_plot_pdf + - fltr_gene_dnst_spl_cnd_plot_pdf + - fltr_mito_dnst_spl_cnd_plot_pdf + - fltr_nvlt_dnst_spl_cnd_plot_pdf + - fltr_frgm_dnst_spl_cnd_plot_pdf + - fltr_peak_dnst_spl_cnd_plot_pdf + - fltr_blck_dnst_spl_cnd_plot_pdf - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds @@ -1258,6 +1320,74 @@ steps: - stdout_log - stderr_log + pdf_plots: + run: ../tools/files-to-folder.cwl + in: + input_files: + source: + - sc_multiome_filter/raw_1_2_qc_mtrcs_pca_plot_pdf + - sc_multiome_filter/raw_2_3_qc_mtrcs_pca_plot_pdf + - sc_multiome_filter/raw_cells_count_plot_pdf + - sc_multiome_filter/raw_umi_dnst_plot_pdf + - sc_multiome_filter/raw_gene_dnst_plot_pdf + - sc_multiome_filter/raw_gene_umi_plot_pdf + - sc_multiome_filter/raw_mito_dnst_plot_pdf + - sc_multiome_filter/raw_nvlt_dnst_plot_pdf + - sc_multiome_filter/raw_frgm_dnst_plot_pdf + - sc_multiome_filter/raw_peak_dnst_plot_pdf + - 
sc_multiome_filter/raw_blck_dnst_plot_pdf + - sc_multiome_filter/raw_rna_atac_cnts_plot_pdf + - sc_multiome_filter/raw_tss_frgm_plot_pdf + - sc_multiome_filter/raw_qc_mtrcs_dnst_plot_pdf + - sc_multiome_filter/raw_rnadbl_plot_pdf + - sc_multiome_filter/raw_atacdbl_plot_pdf + - sc_multiome_filter/raw_vrlpdbl_plot_pdf + - sc_multiome_filter/raw_tss_nrch_plot_pdf + - sc_multiome_filter/raw_frgm_hist_pdf + - sc_multiome_filter/raw_umi_dnst_spl_cnd_plot_pdf + - sc_multiome_filter/raw_gene_dnst_spl_cnd_plot_pdf + - sc_multiome_filter/raw_mito_dnst_spl_cnd_plot_pdf + - sc_multiome_filter/raw_nvlt_dnst_spl_cnd_plot_pdf + - sc_multiome_filter/raw_frgm_dnst_spl_cnd_plot_pdf + - sc_multiome_filter/raw_peak_dnst_spl_cnd_plot_pdf + - sc_multiome_filter/raw_blck_dnst_spl_cnd_plot_pdf + - sc_multiome_filter/fltr_1_2_qc_mtrcs_pca_plot_pdf + - sc_multiome_filter/fltr_2_3_qc_mtrcs_pca_plot_pdf + - sc_multiome_filter/fltr_cells_count_plot_pdf + - sc_multiome_filter/fltr_umi_dnst_plot_pdf + - sc_multiome_filter/fltr_gene_dnst_plot_pdf + - sc_multiome_filter/fltr_gene_umi_plot_pdf + - sc_multiome_filter/fltr_mito_dnst_plot_pdf + - sc_multiome_filter/fltr_nvlt_dnst_plot_pdf + - sc_multiome_filter/fltr_frgm_dnst_plot_pdf + - sc_multiome_filter/fltr_peak_dnst_plot_pdf + - sc_multiome_filter/fltr_blck_dnst_plot_pdf + - sc_multiome_filter/fltr_rna_atac_cnts_plot_pdf + - sc_multiome_filter/fltr_rnadbl_plot_pdf + - sc_multiome_filter/fltr_atacdbl_plot_pdf + - sc_multiome_filter/fltr_vrlpdbl_plot_pdf + - sc_multiome_filter/fltr_tss_frgm_plot_pdf + - sc_multiome_filter/fltr_qc_mtrcs_dnst_plot_pdf + - sc_multiome_filter/fltr_tss_nrch_plot_pdf + - sc_multiome_filter/fltr_frgm_hist_pdf + - sc_multiome_filter/fltr_umi_dnst_spl_cnd_plot_pdf + - sc_multiome_filter/fltr_gene_dnst_spl_cnd_plot_pdf + - sc_multiome_filter/fltr_mito_dnst_spl_cnd_plot_pdf + - sc_multiome_filter/fltr_nvlt_dnst_spl_cnd_plot_pdf + - sc_multiome_filter/fltr_frgm_dnst_spl_cnd_plot_pdf + - 
sc_multiome_filter/fltr_peak_dnst_spl_cnd_plot_pdf + - sc_multiome_filter/fltr_blck_dnst_spl_cnd_plot_pdf + valueFrom: $(self.flat()) + out: + - folder + + compress_pdf_plots: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: pdf_plots/folder + out: + - compressed_folder + $namespaces: s: http://schema.org/ diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index 53adc2ab..18fc341c 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -428,6 +428,14 @@ outputs: doc: | Processed Seurat data in SCope compatible loom format + pdf_plots: + type: File + outputSource: compress_pdf_plots/compressed_folder + label: "Plots in PDF format" + doc: | + Compressed folder with plots + in PDF format + sc_rna_cluster_stdout_log: type: File outputSource: sc_rna_cluster/stdout_log @@ -475,6 +483,8 @@ steps: default: true export_scope_data: default: true + export_pdf_plots: + default: true color_theme: color_theme parallel_memory_limit: default: 32 @@ -500,6 +510,22 @@ steps: - xpr_per_cell_sgnl_plot_png - xpr_dnst_res_plot_png - xpr_htmp_res_plot_png + - umap_res_plot_pdf + - slh_res_plot_pdf + - umap_spl_idnt_res_plot_pdf + - cmp_gr_clst_spl_idnt_res_plot_pdf + - cmp_gr_idnt_spl_clst_res_plot_pdf + - umap_spl_cnd_res_plot_pdf + - cmp_gr_clst_spl_cnd_res_plot_pdf + - cmp_gr_cnd_spl_clst_res_plot_pdf + - umap_spl_ph_res_plot_pdf + - cmp_gr_ph_spl_idnt_plot_pdf + - cmp_gr_ph_spl_clst_res_plot_pdf + - xpr_avg_res_plot_pdf + - xpr_per_cell_plot_pdf + - xpr_per_cell_sgnl_plot_pdf + - xpr_dnst_res_plot_pdf + - xpr_htmp_res_plot_pdf - gene_markers_tsv - ucsc_cb_html_data - ucsc_cb_html_file @@ -508,6 +534,38 @@ steps: - stdout_log - stderr_log + pdf_plots: + run: ../tools/files-to-folder.cwl + in: + input_files: + source: + - sc_rna_cluster/umap_res_plot_pdf + - sc_rna_cluster/slh_res_plot_pdf + - sc_rna_cluster/umap_spl_idnt_res_plot_pdf + - sc_rna_cluster/cmp_gr_clst_spl_idnt_res_plot_pdf + - 
sc_rna_cluster/cmp_gr_idnt_spl_clst_res_plot_pdf + - sc_rna_cluster/umap_spl_cnd_res_plot_pdf + - sc_rna_cluster/cmp_gr_clst_spl_cnd_res_plot_pdf + - sc_rna_cluster/cmp_gr_cnd_spl_clst_res_plot_pdf + - sc_rna_cluster/umap_spl_ph_res_plot_pdf + - sc_rna_cluster/cmp_gr_ph_spl_idnt_plot_pdf + - sc_rna_cluster/cmp_gr_ph_spl_clst_res_plot_pdf + - sc_rna_cluster/xpr_avg_res_plot_pdf + - sc_rna_cluster/xpr_per_cell_plot_pdf + - sc_rna_cluster/xpr_per_cell_sgnl_plot_pdf + - sc_rna_cluster/xpr_dnst_res_plot_pdf + - sc_rna_cluster/xpr_htmp_res_plot_pdf + valueFrom: $(self.flat()) + out: + - folder + + compress_pdf_plots: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: pdf_plots/folder + out: + - compressed_folder + $namespaces: s: http://schema.org/ diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 32a3c038..9aa7014a 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -524,6 +524,14 @@ outputs: doc: | Processed Seurat data in RDS format + pdf_plots: + type: File + outputSource: compress_pdf_plots/compressed_folder + label: "Plots in PDF format" + doc: | + Compressed folder with plots + in PDF format + sc_rna_reduce_stdout_log: type: File outputSource: sc_rna_reduce/stdout_log @@ -597,6 +605,8 @@ steps: export_ucsc_cb: export_ucsc_cb low_memory: default: true + export_pdf_plots: + default: true color_theme: color_theme parallel_memory_limit: default: 32 @@ -623,12 +633,62 @@ steps: - umap_gr_cnd_spl_mito_plot_png - umap_gr_cnd_spl_umi_plot_png - umap_gr_cnd_spl_gene_plot_png + - elbow_plot_pdf + - qc_dim_corr_plot_pdf + - umap_qc_mtrcs_plot_pdf + - umap_plot_pdf + - umap_spl_ph_plot_pdf + - ccpca_plot_pdf + - umap_spl_mito_plot_pdf + - umap_spl_umi_plot_pdf + - umap_spl_gene_plot_pdf + - umap_spl_idnt_plot_pdf + - ccpca_spl_idnt_plot_pdf + - umap_spl_cnd_plot_pdf + - umap_gr_cnd_spl_ph_plot_pdf + - ccpca_spl_cnd_plot_pdf + - umap_gr_cnd_spl_mito_plot_pdf + - umap_gr_cnd_spl_umi_plot_pdf + - 
umap_gr_cnd_spl_gene_plot_pdf - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds - stdout_log - stderr_log + pdf_plots: + run: ../tools/files-to-folder.cwl + in: + input_files: + source: + - sc_rna_reduce/elbow_plot_pdf + - sc_rna_reduce/qc_dim_corr_plot_pdf + - sc_rna_reduce/umap_qc_mtrcs_plot_pdf + - sc_rna_reduce/umap_plot_pdf + - sc_rna_reduce/umap_spl_ph_plot_pdf + - sc_rna_reduce/ccpca_plot_pdf + - sc_rna_reduce/umap_spl_mito_plot_pdf + - sc_rna_reduce/umap_spl_umi_plot_pdf + - sc_rna_reduce/umap_spl_gene_plot_pdf + - sc_rna_reduce/umap_spl_idnt_plot_pdf + - sc_rna_reduce/ccpca_spl_idnt_plot_pdf + - sc_rna_reduce/umap_spl_cnd_plot_pdf + - sc_rna_reduce/umap_gr_cnd_spl_ph_plot_pdf + - sc_rna_reduce/ccpca_spl_cnd_plot_pdf + - sc_rna_reduce/umap_gr_cnd_spl_mito_plot_pdf + - sc_rna_reduce/umap_gr_cnd_spl_umi_plot_pdf + - sc_rna_reduce/umap_gr_cnd_spl_gene_plot_pdf + valueFrom: $(self.flat()) + out: + - folder + + compress_pdf_plots: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: pdf_plots/folder + out: + - compressed_folder + $namespaces: s: http://schema.org/ diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index 62c017d3..a87ec894 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -490,6 +490,14 @@ outputs: Only not normalized raw counts from the RNA assay will be saved + pdf_plots: + type: File + outputSource: compress_pdf_plots/compressed_folder + label: "Plots in PDF format" + doc: | + Compressed folder with plots + in PDF format + sc_wnn_cluster_stdout_log: type: File outputSource: sc_wnn_cluster/stdout_log @@ -545,6 +553,8 @@ steps: default: true export_scope_data: default: true + export_pdf_plots: + default: true color_theme: color_theme parallel_memory_limit: default: 32 @@ -570,6 +580,22 @@ steps: - xpr_dnst_res_plot_png - cvrg_res_plot_png - xpr_htmp_res_plot_png + - umap_res_plot_pdf + - umap_spl_idnt_res_plot_pdf + - cmp_gr_clst_spl_idnt_res_plot_pdf + - 
cmp_gr_idnt_spl_clst_res_plot_pdf + - umap_spl_cnd_res_plot_pdf + - cmp_gr_clst_spl_cnd_res_plot_pdf + - cmp_gr_cnd_spl_clst_res_plot_pdf + - umap_spl_ph_res_plot_pdf + - cmp_gr_ph_spl_idnt_plot_pdf + - cmp_gr_ph_spl_clst_res_plot_pdf + - xpr_avg_res_plot_pdf + - xpr_per_cell_plot_pdf + - xpr_per_cell_sgnl_plot_pdf + - xpr_dnst_res_plot_pdf + - cvrg_res_plot_pdf + - xpr_htmp_res_plot_pdf - gene_markers_tsv - peak_markers_tsv - ucsc_cb_html_data @@ -579,6 +605,38 @@ steps: - stdout_log - stderr_log + pdf_plots: + run: ../tools/files-to-folder.cwl + in: + input_files: + source: + - sc_wnn_cluster/umap_res_plot_pdf + - sc_wnn_cluster/umap_spl_idnt_res_plot_pdf + - sc_wnn_cluster/cmp_gr_clst_spl_idnt_res_plot_pdf + - sc_wnn_cluster/cmp_gr_idnt_spl_clst_res_plot_pdf + - sc_wnn_cluster/umap_spl_cnd_res_plot_pdf + - sc_wnn_cluster/cmp_gr_clst_spl_cnd_res_plot_pdf + - sc_wnn_cluster/cmp_gr_cnd_spl_clst_res_plot_pdf + - sc_wnn_cluster/umap_spl_ph_res_plot_pdf + - sc_wnn_cluster/cmp_gr_ph_spl_idnt_plot_pdf + - sc_wnn_cluster/cmp_gr_ph_spl_clst_res_plot_pdf + - sc_wnn_cluster/xpr_avg_res_plot_pdf + - sc_wnn_cluster/xpr_per_cell_plot_pdf + - sc_wnn_cluster/xpr_per_cell_sgnl_plot_pdf + - sc_wnn_cluster/xpr_dnst_res_plot_pdf + - sc_wnn_cluster/cvrg_res_plot_pdf + - sc_wnn_cluster/xpr_htmp_res_plot_pdf + valueFrom: $(self.flat()) + out: + - folder + + compress_pdf_plots: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: pdf_plots/folder + out: + - compressed_folder + $namespaces: s: http://schema.org/ From 5800f42262c6fa7725242ec9004cb25366b10714 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 19 Jul 2023 14:58:04 -0400 Subject: [PATCH 060/162] Remove null from the array of PDFs --- workflows/sc-atac-cluster.cwl | 2 +- workflows/sc-atac-reduce.cwl | 2 +- workflows/sc-ctype-assign.cwl | 2 +- workflows/sc-multiome-filter.cwl | 2 +- workflows/sc-rna-cluster.cwl | 2 +- workflows/sc-rna-reduce.cwl | 2 +- workflows/sc-wnn-cluster.cwl | 2 +- 7 files changed, 7 
insertions(+), 7 deletions(-) diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index e5b9a42a..08904ee8 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -438,7 +438,7 @@ steps: - sc_atac_cluster/cmp_gr_clst_spl_cnd_res_plot_pdf - sc_atac_cluster/cmp_gr_cnd_spl_clst_res_plot_pdf - sc_atac_cluster/cvrg_res_plot_pdf - valueFrom: $(self.flat()) + valueFrom: $(self.flat().filter(n => n)) out: - folder diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index 943fb06b..6e57f915 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -582,7 +582,7 @@ steps: - sc_atac_reduce/umap_gr_cnd_spl_ncls_plot_pdf - sc_atac_reduce/umap_gr_cnd_spl_frip_plot_pdf - sc_atac_reduce/umap_gr_cnd_spl_blck_plot_pdf - valueFrom: $(self.flat()) + valueFrom: $(self.flat().filter(n => n)) out: - folder diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 10496f24..c7c32274 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -773,7 +773,7 @@ steps: - ctype_assign/xpr_per_cell_sgnl_rd_wnnumap_plot_pdf - ctype_assign/cvrg_plot_pdf - ctype_assign/xpr_htmp_plot_pdf - valueFrom: $(self.flat()) + valueFrom: $(self.flat().filter(n => n)) out: - folder diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 925c7116..2d2dd8aa 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -1377,7 +1377,7 @@ steps: - sc_multiome_filter/fltr_frgm_dnst_spl_cnd_plot_pdf - sc_multiome_filter/fltr_peak_dnst_spl_cnd_plot_pdf - sc_multiome_filter/fltr_blck_dnst_spl_cnd_plot_pdf - valueFrom: $(self.flat()) + valueFrom: $(self.flat().filter(n => n)) out: - folder diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index 18fc341c..fb375d97 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -555,7 +555,7 @@ steps: - 
sc_rna_cluster/xpr_per_cell_sgnl_plot_pdf - sc_rna_cluster/xpr_dnst_res_plot_pdf - sc_rna_cluster/xpr_htmp_res_plot_pdf - valueFrom: $(self.flat()) + valueFrom: $(self.flat().filter(n => n)) out: - folder diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 9aa7014a..96f004af 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -678,7 +678,7 @@ steps: - sc_rna_reduce/umap_gr_cnd_spl_mito_plot_pdf - sc_rna_reduce/umap_gr_cnd_spl_umi_plot_pdf - sc_rna_reduce/umap_gr_cnd_spl_gene_plot_pdf - valueFrom: $(self.flat()) + valueFrom: $(self.flat().filter(n => n)) out: - folder diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index a87ec894..52c1d18f 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -626,7 +626,7 @@ steps: - sc_wnn_cluster/xpr_dnst_res_plot_pdf - sc_wnn_cluster/cvrg_res_plot_pdf - sc_wnn_cluster/xpr_htmp_res_plot_pdf - valueFrom: $(self.flat()) + valueFrom: $(self.flat().filter(n => n)) out: - folder From 4c618b42dd9c4a66916934cd6c668a169d125389 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 20 Jul 2023 12:31:00 -0400 Subject: [PATCH 061/162] Update all sc workflows to export PDFs to the compressed folder --- tools/files-to-folder.cwl | 6 ++- workflows/sc-atac-cluster.cwl | 2 + workflows/sc-atac-dbinding.cwl | 36 +++++++++++++ workflows/sc-atac-reduce.cwl | 2 + workflows/sc-ctype-assign.cwl | 2 + workflows/sc-multiome-filter.cwl | 2 + workflows/sc-rna-cluster.cwl | 2 + workflows/sc-rna-da-cells.cwl | 48 +++++++++++++++++ workflows/sc-rna-de-pseudobulk.cwl | 52 ++++++++++++++++++ workflows/sc-rna-filter.cwl | 84 ++++++++++++++++++++++++++++++ workflows/sc-rna-reduce.cwl | 2 + workflows/sc-triangulate.cwl | 46 ++++++++++++++++ workflows/sc-wnn-cluster.cwl | 2 + 13 files changed, 285 insertions(+), 1 deletion(-) diff --git a/tools/files-to-folder.cwl b/tools/files-to-folder.cwl index 0783cfcc..9ec68b89 100644 --- a/tools/files-to-folder.cwl +++ 
b/tools/files-to-folder.cwl @@ -10,13 +10,17 @@ inputs: type: - File[] - File + folder_basename: + type: string? + default: "" outputs: folder: Directory expression: | ${ + var folder_basename = inputs.folder_basename.split('/').slice(-1).join(''); var folder = { "class": "Directory", - "basename": "", + "basename": folder_basename, "listing": [] } var files = []; diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index 08904ee8..c6dbc3ea 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -439,6 +439,8 @@ steps: - sc_atac_cluster/cmp_gr_cnd_spl_clst_res_plot_pdf - sc_atac_cluster/cvrg_res_plot_pdf valueFrom: $(self.flat().filter(n => n)) + folder_basename: + default: "pdf_plots" out: - folder diff --git a/workflows/sc-atac-dbinding.cwl b/workflows/sc-atac-dbinding.cwl index d6aff015..6329d8dc 100644 --- a/workflows/sc-atac-dbinding.cwl +++ b/workflows/sc-atac-dbinding.cwl @@ -619,6 +619,14 @@ outputs: tab: 'Overall' Caption: 'Tag density heatmap around centers of diff. 
bound sites' + pdf_plots: + type: File + outputSource: compress_pdf_plots/compressed_folder + label: "Plots in PDF format" + doc: | + Compressed folder with plots + in PDF format + sc_atac_dbinding_stdout_log: type: File outputSource: sc_atac_dbinding/stdout_log @@ -667,6 +675,8 @@ steps: minimum_logfc: minimum_logfc verbose: default: true + export_pdf_plots: + default: true parallel_memory_limit: source: parallel_memory_limit valueFrom: $(parseInt(self)) @@ -697,9 +707,35 @@ steps: - second_enrch_bigbed_file - first_enrch_bed_file - second_enrch_bed_file + - umap_rd_rnaumap_plot_pdf + - umap_rd_atacumap_plot_pdf + - umap_rd_wnnumap_plot_pdf + - dbnd_vlcn_plot_pdf - stdout_log - stderr_log + pdf_plots: + run: ../tools/files-to-folder.cwl + in: + input_files: + source: + - sc_atac_dbinding/umap_rd_rnaumap_plot_pdf + - sc_atac_dbinding/umap_rd_atacumap_plot_pdf + - sc_atac_dbinding/umap_rd_wnnumap_plot_pdf + - sc_atac_dbinding/dbnd_vlcn_plot_pdf + valueFrom: $(self.flat().filter(n => n)) + folder_basename: + default: "pdf_plots" + out: + - folder + + compress_pdf_plots: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: pdf_plots/folder + out: + - compressed_folder + add_label_column: run: ../tools/custom-bash.cwl in: diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index 6e57f915..a80672c2 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -583,6 +583,8 @@ steps: - sc_atac_reduce/umap_gr_cnd_spl_frip_plot_pdf - sc_atac_reduce/umap_gr_cnd_spl_blck_plot_pdf valueFrom: $(self.flat().filter(n => n)) + folder_basename: + default: "pdf_plots" out: - folder diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index c7c32274..56518b97 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -774,6 +774,8 @@ steps: - ctype_assign/cvrg_plot_pdf - ctype_assign/xpr_htmp_plot_pdf valueFrom: $(self.flat().filter(n => n)) + folder_basename: + default: "pdf_plots" out: - 
folder diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 2d2dd8aa..ea8b3e6b 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -1378,6 +1378,8 @@ steps: - sc_multiome_filter/fltr_peak_dnst_spl_cnd_plot_pdf - sc_multiome_filter/fltr_blck_dnst_spl_cnd_plot_pdf valueFrom: $(self.flat().filter(n => n)) + folder_basename: + default: "pdf_plots" out: - folder diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index fb375d97..f35714e5 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -556,6 +556,8 @@ steps: - sc_rna_cluster/xpr_dnst_res_plot_pdf - sc_rna_cluster/xpr_htmp_res_plot_pdf valueFrom: $(self.flat().filter(n => n)) + folder_basename: + default: "pdf_plots" out: - folder diff --git a/workflows/sc-rna-da-cells.cwl b/workflows/sc-rna-da-cells.cwl index b4d3c910..08cfa767 100644 --- a/workflows/sc-rna-da-cells.cwl +++ b/workflows/sc-rna-da-cells.cwl @@ -342,6 +342,14 @@ outputs: doc: | Processed Seurat data in RDS format + pdf_plots: + type: File + outputSource: compress_pdf_plots/compressed_folder + label: "Plots in PDF format" + doc: | + Compressed folder with plots + in PDF format + da_cells_stdout_log: type: File outputSource: da_cells/stdout_log @@ -378,6 +386,8 @@ steps: default: true export_ucsc_cb: default: true + export_pdf_plots: + default: true color_theme: color_theme parallel_memory_limit: source: parallel_memory_limit @@ -399,12 +409,50 @@ steps: - umap_spl_idnt_rd_rnaumap_da_scr_plot_png - umap_spl_idnt_rd_atacumap_da_scr_plot_png - umap_spl_idnt_rd_wnnumap_da_scr_plot_png + - da_perm_plot_pdf + - umap_rd_rnaumap_res_plot_pdf + - umap_rd_atacumap_res_plot_pdf + - umap_rd_wnnumap_res_plot_pdf + - umap_spl_cnd_rd_rnaumap_res_plot_pdf + - umap_spl_cnd_rd_atacumap_res_plot_pdf + - umap_spl_cnd_rd_wnnumap_res_plot_pdf + - umap_spl_idnt_rd_rnaumap_da_scr_plot_pdf + - umap_spl_idnt_rd_atacumap_da_scr_plot_pdf + - 
umap_spl_idnt_rd_wnnumap_da_scr_plot_pdf - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds - stdout_log - stderr_log + pdf_plots: + run: ../tools/files-to-folder.cwl + in: + input_files: + source: + - da_cells/da_perm_plot_pdf + - da_cells/umap_rd_rnaumap_res_plot_pdf + - da_cells/umap_rd_atacumap_res_plot_pdf + - da_cells/umap_rd_wnnumap_res_plot_pdf + - da_cells/umap_spl_cnd_rd_rnaumap_res_plot_pdf + - da_cells/umap_spl_cnd_rd_atacumap_res_plot_pdf + - da_cells/umap_spl_cnd_rd_wnnumap_res_plot_pdf + - da_cells/umap_spl_idnt_rd_rnaumap_da_scr_plot_pdf + - da_cells/umap_spl_idnt_rd_atacumap_da_scr_plot_pdf + - da_cells/umap_spl_idnt_rd_wnnumap_da_scr_plot_pdf + valueFrom: $(self.flat().filter(n => n)) + folder_basename: + default: "pdf_plots" + out: + - folder + + compress_pdf_plots: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: pdf_plots/folder + out: + - compressed_folder + $namespaces: s: http://schema.org/ diff --git a/workflows/sc-rna-de-pseudobulk.cwl b/workflows/sc-rna-de-pseudobulk.cwl index 0ba9cc2c..cceefc7b 100644 --- a/workflows/sc-rna-de-pseudobulk.cwl +++ b/workflows/sc-rna-de-pseudobulk.cwl @@ -589,6 +589,14 @@ outputs: Filtered normalized reads counts per cell. 
GCT format + pdf_plots: + type: File + outputSource: compress_pdf_plots/compressed_folder + label: "Plots in PDF format" + doc: | + Compressed folder with plots + in PDF format + de_pseudobulk_stdout_log: type: File outputSource: de_pseudobulk/stdout_log @@ -652,6 +660,8 @@ steps: row_distance: row_distance column_distance: column_distance center_row: center_row + export_pdf_plots: + default: true color_theme: color_theme verbose: default: true @@ -677,6 +687,18 @@ steps: - xpr_per_cell_rd_atacumap_plot_png - xpr_per_cell_rd_wnnumap_plot_png - xpr_htmp_plot_png + - umap_rd_rnaumap_plot_pdf + - umap_rd_atacumap_plot_pdf + - umap_rd_wnnumap_plot_pdf + - mds_plot_html + - pca_1_2_plot_pdf + - pca_2_3_plot_pdf + - dxpr_vlcn_plot_pdf + - xpr_dnst_plot_pdf + - xpr_per_cell_rd_rnaumap_plot_pdf + - xpr_per_cell_rd_atacumap_plot_pdf + - xpr_per_cell_rd_wnnumap_plot_pdf + - xpr_htmp_plot_pdf - diff_expr_genes - bulk_read_counts_gct - bulk_phenotypes_cls @@ -684,6 +706,36 @@ steps: - stdout_log - stderr_log + pdf_plots: + run: ../tools/files-to-folder.cwl + in: + input_files: + source: + - de_pseudobulk/umap_rd_rnaumap_plot_pdf + - de_pseudobulk/umap_rd_atacumap_plot_pdf + - de_pseudobulk/umap_rd_wnnumap_plot_pdf + - de_pseudobulk/mds_plot_html + - de_pseudobulk/pca_1_2_plot_pdf + - de_pseudobulk/pca_2_3_plot_pdf + - de_pseudobulk/dxpr_vlcn_plot_pdf + - de_pseudobulk/xpr_dnst_plot_pdf + - de_pseudobulk/xpr_per_cell_rd_rnaumap_plot_pdf + - de_pseudobulk/xpr_per_cell_rd_atacumap_plot_pdf + - de_pseudobulk/xpr_per_cell_rd_wnnumap_plot_pdf + - de_pseudobulk/xpr_htmp_plot_pdf + valueFrom: $(self.flat().filter(n => n)) + folder_basename: + default: "pdf_plots" + out: + - folder + + compress_pdf_plots: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: pdf_plots/folder + out: + - compressed_folder + morpheus_heatmap: run: ../tools/morpheus-heatmap.cwl in: diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index c088145e..96c77dbe 100644 --- 
a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -614,6 +614,14 @@ outputs: Example of datasets metadata file in TSV format + pdf_plots: + type: File + outputSource: compress_pdf_plots/compressed_folder + label: "Plots in PDF format" + doc: | + Compressed folder with plots + in PDF format + sc_rna_filter_stdout_log: type: File outputSource: sc_rna_filter/stdout_log @@ -676,6 +684,8 @@ steps: default: true export_ucsc_cb: default: true + export_pdf_plots: + default: true color_theme: color_theme parallel_memory_limit: source: parallel_memory_limit @@ -715,6 +725,34 @@ steps: - fltr_gene_dnst_spl_cnd_plot_png - fltr_mito_dnst_spl_cnd_plot_png - fltr_nvlt_dnst_spl_cnd_plot_png + - raw_1_2_qc_mtrcs_pca_plot_pdf + - raw_2_3_qc_mtrcs_pca_plot_pdf + - raw_cells_count_plot_pdf + - raw_umi_dnst_plot_pdf + - raw_gene_dnst_plot_pdf + - raw_gene_umi_plot_pdf + - raw_mito_dnst_plot_pdf + - raw_nvlt_dnst_plot_pdf + - raw_qc_mtrcs_dnst_plot_pdf + - raw_rnadbl_plot_pdf + - raw_umi_dnst_spl_cnd_plot_pdf + - raw_gene_dnst_spl_cnd_plot_pdf + - raw_mito_dnst_spl_cnd_plot_pdf + - raw_nvlt_dnst_spl_cnd_plot_pdf + - fltr_1_2_qc_mtrcs_pca_plot_pdf + - fltr_2_3_qc_mtrcs_pca_plot_pdf + - fltr_cells_count_plot_pdf + - fltr_umi_dnst_plot_pdf + - fltr_gene_dnst_plot_pdf + - fltr_gene_umi_plot_pdf + - fltr_mito_dnst_plot_pdf + - fltr_nvlt_dnst_plot_pdf + - fltr_qc_mtrcs_dnst_plot_pdf + - fltr_rnadbl_plot_pdf + - fltr_umi_dnst_spl_cnd_plot_pdf + - fltr_gene_dnst_spl_cnd_plot_pdf + - fltr_mito_dnst_spl_cnd_plot_pdf + - fltr_nvlt_dnst_spl_cnd_plot_pdf - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds @@ -722,6 +760,52 @@ steps: - stdout_log - stderr_log + pdf_plots: + run: ../tools/files-to-folder.cwl + in: + input_files: + source: + - sc_rna_filter/raw_1_2_qc_mtrcs_pca_plot_pdf + - sc_rna_filter/raw_2_3_qc_mtrcs_pca_plot_pdf + - sc_rna_filter/raw_cells_count_plot_pdf + - sc_rna_filter/raw_umi_dnst_plot_pdf + - sc_rna_filter/raw_gene_dnst_plot_pdf + - 
sc_rna_filter/raw_gene_umi_plot_pdf + - sc_rna_filter/raw_mito_dnst_plot_pdf + - sc_rna_filter/raw_nvlt_dnst_plot_pdf + - sc_rna_filter/raw_qc_mtrcs_dnst_plot_pdf + - sc_rna_filter/raw_rnadbl_plot_pdf + - sc_rna_filter/raw_umi_dnst_spl_cnd_plot_pdf + - sc_rna_filter/raw_gene_dnst_spl_cnd_plot_pdf + - sc_rna_filter/raw_mito_dnst_spl_cnd_plot_pdf + - sc_rna_filter/raw_nvlt_dnst_spl_cnd_plot_pdf + - sc_rna_filter/fltr_1_2_qc_mtrcs_pca_plot_pdf + - sc_rna_filter/fltr_2_3_qc_mtrcs_pca_plot_pdf + - sc_rna_filter/fltr_cells_count_plot_pdf + - sc_rna_filter/fltr_umi_dnst_plot_pdf + - sc_rna_filter/fltr_gene_dnst_plot_pdf + - sc_rna_filter/fltr_gene_umi_plot_pdf + - sc_rna_filter/fltr_mito_dnst_plot_pdf + - sc_rna_filter/fltr_nvlt_dnst_plot_pdf + - sc_rna_filter/fltr_qc_mtrcs_dnst_plot_pdf + - sc_rna_filter/fltr_rnadbl_plot_pdf + - sc_rna_filter/fltr_umi_dnst_spl_cnd_plot_pdf + - sc_rna_filter/fltr_gene_dnst_spl_cnd_plot_pdf + - sc_rna_filter/fltr_mito_dnst_spl_cnd_plot_pdf + - sc_rna_filter/fltr_nvlt_dnst_spl_cnd_plot_pdf + valueFrom: $(self.flat().filter(n => n)) + folder_basename: + default: "pdf_plots" + out: + - folder + + compress_pdf_plots: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: pdf_plots/folder + out: + - compressed_folder + $namespaces: s: http://schema.org/ diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 96f004af..b93ee22a 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -679,6 +679,8 @@ steps: - sc_rna_reduce/umap_gr_cnd_spl_umi_plot_pdf - sc_rna_reduce/umap_gr_cnd_spl_gene_plot_pdf valueFrom: $(self.flat().filter(n => n)) + folder_basename: + default: "pdf_plots" out: - folder diff --git a/workflows/sc-triangulate.cwl b/workflows/sc-triangulate.cwl index 33166383..67db96f9 100644 --- a/workflows/sc-triangulate.cwl +++ b/workflows/sc-triangulate.cwl @@ -281,6 +281,14 @@ outputs: doc: | Processed Seurat data in RDS format + pdf_plots: + type: File + outputSource: 
compress_pdf_plots/compressed_folder + label: "Plots in PDF format" + doc: | + Compressed folder with plots + in PDF format + triangulate_stdout_log: type: File outputSource: triangulate/stdout_log @@ -310,6 +318,8 @@ steps: default: true export_ucsc_cb: default: true + export_pdf_plots: + default: true color_theme: color_theme parallel_memory_limit: source: parallel_memory_limit @@ -330,12 +340,48 @@ steps: - umap_tric_rd_rnaumap_plot_png - umap_tric_rd_atacumap_plot_png - umap_tric_rd_wnnumap_plot_png + - umap_tril_rd_rnaumap_plot_pdf + - umap_tril_rd_atacumap_plot_pdf + - umap_tril_rd_wnnumap_plot_pdf + - umap_tria_rd_rnaumap_plot_pdf + - umap_tria_rd_atacumap_plot_pdf + - umap_tria_rd_wnnumap_plot_pdf + - umap_tric_rd_rnaumap_plot_pdf + - umap_tric_rd_atacumap_plot_pdf + - umap_tric_rd_wnnumap_plot_pdf - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds - stdout_log - stderr_log + pdf_plots: + run: ../tools/files-to-folder.cwl + in: + input_files: + source: + - triangulate/umap_tril_rd_rnaumap_plot_pdf + - triangulate/umap_tril_rd_atacumap_plot_pdf + - triangulate/umap_tril_rd_wnnumap_plot_pdf + - triangulate/umap_tria_rd_rnaumap_plot_pdf + - triangulate/umap_tria_rd_atacumap_plot_pdf + - triangulate/umap_tria_rd_wnnumap_plot_pdf + - triangulate/umap_tric_rd_rnaumap_plot_pdf + - triangulate/umap_tric_rd_atacumap_plot_pdf + - triangulate/umap_tric_rd_wnnumap_plot_pdf + valueFrom: $(self.flat().filter(n => n)) + folder_basename: + default: "pdf_plots" + out: + - folder + + compress_pdf_plots: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: pdf_plots/folder + out: + - compressed_folder + $namespaces: s: http://schema.org/ diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index 52c1d18f..8caa52de 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -627,6 +627,8 @@ steps: - sc_wnn_cluster/cvrg_res_plot_pdf - sc_wnn_cluster/xpr_htmp_res_plot_pdf valueFrom: $(self.flat().filter(n => n)) + 
folder_basename: + default: "pdf_plots" out: - folder From 954acf825d81e8138812670f74a0096203c427ac Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 21 Jul 2023 16:20:35 -0400 Subject: [PATCH 062/162] Call MACS2 peaks for sc multiome filter --- tools/sc-atac-cluster.cwl | 2 +- tools/sc-atac-coverage.cwl | 2 +- tools/sc-atac-dbinding.cwl | 2 +- tools/sc-atac-reduce.cwl | 2 +- tools/sc-ctype-assign.cwl | 2 +- tools/sc-multiome-filter.cwl | 69 ++++++++++++++++++++------------ tools/sc-rna-cluster.cwl | 2 +- tools/sc-rna-da-cells.cwl | 2 +- tools/sc-rna-de-pseudobulk.cwl | 2 +- tools/sc-rna-filter.cwl | 2 +- tools/sc-rna-reduce.cwl | 2 +- tools/sc-triangulate.cwl | 2 +- tools/sc-wnn-cluster.cwl | 2 +- workflows/sc-multiome-filter.cwl | 42 +++++++++++++++++++ 14 files changed, 98 insertions(+), 37 deletions(-) diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index f3fa297b..180c28ff 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.25 + dockerPull: biowardrobe2/sc-tools:v0.0.26 inputs: diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index c7ab53e9..42a919f9 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.25 + dockerPull: biowardrobe2/sc-tools:v0.0.26 inputs: diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index ac912288..7698b2c0 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.25 + dockerPull: biowardrobe2/sc-tools:v0.0.26 inputs: diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index 7911474f..144c8e8a 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: 
hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.25 + dockerPull: biowardrobe2/sc-tools:v0.0.26 inputs: diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index 3fb6c3d0..acda4c3b 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.25 + dockerPull: biowardrobe2/sc-tools:v0.0.26 inputs: diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index 14f59657..98926a6b 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.25 + dockerPull: biowardrobe2/sc-tools:v0.0.26 inputs: @@ -289,6 +289,14 @@ inputs: scores filters. Default: do not call peaks + minimum_qvalue: + type: float? + inputBinding: + prefix: "--qvalue" + doc: | + Minimum FDR (q-value) cutoff for MACS2 peak detection. + Ignored if --callby is not provided. 
Default: 0.05 + remove_doublets: type: - "null" @@ -1824,30 +1832,36 @@ doc: | s:about: | - usage: sc_multiome_filter.R [-h] --mex MEX --identity IDENTITY --fragments - FRAGMENTS --annotations ANNOTATIONS --seqinfo - SEQINFO [--grouping GROUPING] - [--blacklist BLACKLIST] [--barcodes BARCODES] - [--rnamincells RNAMINCELLS] - [--mingenes [MINGENES [MINGENES ...]]] - [--maxgenes [MAXGENES [MAXGENES ...]]] - [--minumis [MINUMIS [MINUMIS ...]]] - [--mitopattern MITOPATTERN] [--maxmt MAXMT] - [--minnovelty [MINNOVELTY [MINNOVELTY ...]]] - [--atacmincells ATACMINCELLS] - [--minfragments [MINFRAGMENTS [MINFRAGMENTS ...]]] - [--maxnuclsignal [MAXNUCLSIGNAL [MAXNUCLSIGNAL ...]]] - [--mintssenrich [MINTSSENRICH [MINTSSENRICH ...]]] - [--minfrip [MINFRIP [MINFRIP ...]]] - [--maxblacklist [MAXBLACKLIST [MAXBLACKLIST ...]]] - [--callby CALLBY] - [--removedoublets {union,onlyrna,onlyatac,intersect}] - [--rnadbr RNADBR] [--rnadbrsd RNADBRSD] - [--atacdbr ATACDBR] [--atacdbrsd ATACDBRSD] - [--pdf] [--verbose] [--h5seurat] [--h5ad] - [--cbbuild] [--output OUTPUT] - [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] - [--cpus CPUS] [--memory MEMORY] + usage: sc_multiome_filter.R [-h] --mex MEX --identity IDENTITY + --fragments FRAGMENTS --annotations + ANNOTATIONS --seqinfo SEQINFO + [--grouping GROUPING] + [--blacklist BLACKLIST] + [--barcodes BARCODES] + [--rnamincells RNAMINCELLS] + [--mingenes [MINGENES [MINGENES ...]]] + [--maxgenes [MAXGENES [MAXGENES ...]]] + [--minumis [MINUMIS [MINUMIS ...]]] + [--mitopattern MITOPATTERN] + [--maxmt MAXMT] + [--minnovelty [MINNOVELTY [MINNOVELTY ...]]] + [--atacmincells ATACMINCELLS] + [--minfragments [MINFRAGMENTS [MINFRAGMENTS ...]]] + [--maxnuclsignal [MAXNUCLSIGNAL [MAXNUCLSIGNAL ...]]] + [--mintssenrich [MINTSSENRICH [MINTSSENRICH ...]]] + [--minfrip [MINFRIP [MINFRIP ...]]] + [--maxblacklist [MAXBLACKLIST [MAXBLACKLIST ...]]] + [--callby CALLBY] [--qvalue QVALUE] + [--removedoublets {union,onlyrna,onlyatac,intersect}] + 
[--rnadbr RNADBR] + [--rnadbrsd RNADBRSD] + [--atacdbr ATACDBR] + [--atacdbrsd ATACDBRSD] [--pdf] + [--verbose] [--h5seurat] [--h5ad] + [--cbbuild] [--tmpdir TMPDIR] + [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] Single-cell Multiome ATAC and RNA-Seq Filtering Analysis @@ -1970,6 +1984,8 @@ s:about: | only after applying all RNA related thresholds, maximum nucleosome signal, and minimum TSS enrichment scores filters. Default: do not call peaks + --qvalue QVALUE Minimum FDR (q-value) cutoff for MACS2 peak detection. + Ignored if --callby is not provided. Default: 0.05 --removedoublets {union,onlyrna,onlyatac,intersect} Remove cells that were identified as doublets. For RNA assay cells with UMI < 200 will not be evaluated. @@ -1995,6 +2011,9 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save Seurat data to h5ad file. Default: false --cbbuild Export results to UCSC Cell Browser. Default: false + --tmpdir TMPDIR Directory to keep temporary files. Default: either + /tmp or defined by environment variables TMPDIR, TMP, + TEMP. --output OUTPUT Output prefix. Default: ./sc --theme {gray,bw,linedraw,light,dark,minimal,classic,void} Color theme for all generated plots. 
Default: classic diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index b6fc7399..19ccc39f 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.25 + dockerPull: biowardrobe2/sc-tools:v0.0.26 inputs: diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index 1d3ac8e8..f543902e 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.25 + dockerPull: biowardrobe2/sc-tools:v0.0.26 inputs: diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index d465e37b..7a4ec18e 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.25 + dockerPull: biowardrobe2/sc-tools:v0.0.26 inputs: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index f7c0cfe4..37a04614 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.25 + dockerPull: biowardrobe2/sc-tools:v0.0.26 inputs: diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index 9302fdda..ec10de08 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.25 + dockerPull: biowardrobe2/sc-tools:v0.0.26 inputs: diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index 4a1a3987..ef50b511 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.25 + dockerPull: biowardrobe2/sc-tools:v0.0.26 inputs: diff --git 
a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index bcee379e..6ce53030 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.25 + dockerPull: biowardrobe2/sc-tools:v0.0.26 inputs: diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index ea8b3e6b..5d75eaba 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -102,6 +102,35 @@ inputs: utilized in the current or future steps of analysis. + call_by: + type: string? + default: "" + label: "Cells grouping for MACS2 peak calling" + doc: | + Single cell metadata column to be used + for cells grouping before using MACS2 + to replace 10x peaks with the new ones. + To group cells by dataset, use "dataset". + Custom groups can be defined based on + any single cell metadata added through + the "Selected cell barcodes (optional)" + input. Default: use the original peaks + generated by Cell Ranger ARC. + 'sd:layout': + advanced: true + + minimum_qvalue: + type: float? + default: 0.05 + label: "Minimum MACS2 FDR" + doc: | + Minimum FDR (q-value) cutoff for MACS2 peak + detection. Ignored if "Cells grouping for + MACS2 peak calling" input is not provided. 
+ Default: 0.05 + 'sd:layout': + advanced: true + remove_doublets: type: - type: enum @@ -1145,6 +1174,19 @@ steps: grouping_data: grouping_data blacklist_regions_file: blacklist_regions_file barcodes_data: barcodes_data + call_by: + source: call_by + valueFrom: | + ${ + if (self == "dataset") { + return "new.ident"; + } else if (self == "") { + return null; + } else { + return self; + } + } + minimum_qvalue: minimum_qvalue rna_minimum_cells: default: 1 # will remove genes that are not expressed in any of the cells minimum_genes: From 62d3b88eae97e5472eb5904615e2cc79370771ce Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 25 Jul 2023 15:42:48 -0400 Subject: [PATCH 063/162] Refactor sc rna diff expr workflow --- tools/sc-rna-de-pseudobulk.cwl | 114 +++-- workflows/sc-rna-de-pseudobulk.cwl | 748 ++++++++++++++--------------- 2 files changed, 413 insertions(+), 449 deletions(-) diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index 7a4ec18e..df4f9936 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -330,9 +330,9 @@ outputs: outputBinding: glob: "*_umap_rd_rnaumap.png" doc: | - Cells UMAP split by selected criteria, + UMAP, split by selected criteria, optionally subsetted to the specific - group (rnaumap dim. reduction). + group, RNA. PNG format umap_rd_rnaumap_plot_pdf: @@ -340,9 +340,9 @@ outputs: outputBinding: glob: "*_umap_rd_rnaumap.pdf" doc: | - Cells UMAP split by selected criteria, + UMAP, split by selected criteria, optionally subsetted to the specific - group (rnaumap dim. reduction). + group, RNA. PDF format umap_rd_atacumap_plot_png: @@ -350,9 +350,9 @@ outputs: outputBinding: glob: "*_umap_rd_atacumap.png" doc: | - Cells UMAP split by selected criteria, + UMAP, split by selected criteria, optionally subsetted to the specific - group (atacumap dim. reduction). + group, ATAC. 
PNG format umap_rd_atacumap_plot_pdf: @@ -360,9 +360,9 @@ outputs: outputBinding: glob: "*_umap_rd_atacumap.pdf" doc: | - Cells UMAP split by selected criteria, + UMAP, split by selected criteria, optionally subsetted to the specific - group (atacumap dim. reduction). + group, ATAC. PDF format umap_rd_wnnumap_plot_png: @@ -370,9 +370,9 @@ outputs: outputBinding: glob: "*_umap_rd_wnnumap.png" doc: | - Cells UMAP split by selected criteria, + UMAP, split by selected criteria, optionally subsetted to the specific - group (wnnumap dim. reduction). + group, WNN. PNG format umap_rd_wnnumap_plot_pdf: @@ -380,9 +380,9 @@ outputs: outputBinding: glob: "*_umap_rd_wnnumap.pdf" doc: | - Cells UMAP split by selected criteria, + UMAP, split by selected criteria, optionally subsetted to the specific - group (wnnumap dim. reduction). + group, WNN. PDF format mds_plot_html: @@ -391,7 +391,7 @@ outputs: glob: "*_mds_plot.html" doc: | MDS plot of pseudobulk aggregated - normalized reads counts. All genes. + normalized reads counts. HTML format pca_1_2_plot_png: @@ -399,8 +399,7 @@ outputs: outputBinding: glob: "*_pca_1_2.png" doc: | - Normalized reads counts PCA (1, 2). - All genes. + Gene expression PCA (1,2). PNG format pca_1_2_plot_pdf: @@ -408,8 +407,7 @@ outputs: outputBinding: glob: "*_pca_1_2.pdf" doc: | - Normalized reads counts PCA (1, 2). - All genes. + Gene expression PCA (1,2). PDF format pca_2_3_plot_png: @@ -417,8 +415,7 @@ outputs: outputBinding: glob: "*_pca_2_3.png" doc: | - Normalized reads counts PCA (2, 3). - All genes. + Gene expression PCA (2,3). PNG format pca_2_3_plot_pdf: @@ -426,8 +423,7 @@ outputs: outputBinding: glob: "*_pca_2_3.pdf" doc: | - Normalized reads counts PCA (2, 3). - All genes. + Gene expression PCA (2,3). 
PDF format dxpr_vlcn_plot_png: @@ -463,11 +459,11 @@ outputs: outputBinding: glob: "*_xpr_dnst.png" doc: | - Log normalized gene expression density plots for - either user provided or top 10 differentially - expressed genes with the highest log2FoldChange - values. The direction of comparison is defined - as --second vs --first. + Gene expression violin plots for either user + provided or top 10 differentially expressed + genes with the highest log2FoldChange values. + The direction of comparison is defined as + --second vs --first. PNG format xpr_dnst_plot_pdf: @@ -475,11 +471,11 @@ outputs: outputBinding: glob: "*_xpr_dnst.pdf" doc: | - Log normalized gene expression density plots for - either user provided or top 10 differentially - expressed genes with the highest log2FoldChange - values. The direction of comparison is defined - as --second vs --first. + Gene expression violin plots for either user + provided or top 10 differentially expressed + genes with the highest log2FoldChange values. + The direction of comparison is defined as + --second vs --first. PDF format xpr_per_cell_rd_rnaumap_plot_png: @@ -490,9 +486,9 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_rnaumap_*.png" doc: | - Log normalized gene expression on cells UMAP - split by selected criteria, optionally subsetted - to the specific group (rnaumap dim. reduction). + UMAP, gene expression, split by selected + criteria, optionally subsetted to the + specific group, RNA. PNG format xpr_per_cell_rd_rnaumap_plot_pdf: @@ -503,9 +499,9 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_rnaumap_*.pdf" doc: | - Log normalized gene expression on cells UMAP - split by selected criteria, optionally subsetted - to the specific group (rnaumap dim. reduction). + UMAP, gene expression, split by selected + criteria, optionally subsetted to the + specific group, RNA. 
PDF format xpr_per_cell_rd_atacumap_plot_png: @@ -516,9 +512,9 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_atacumap_*.png" doc: | - Log normalized gene expression on cells UMAP - split by selected criteria, optionally subsetted - to the specific group (atacumap dim. reduction). + UMAP, gene expression, split by selected + criteria, optionally subsetted to the + specific group, ATAC. PNG format xpr_per_cell_rd_atacumap_plot_pdf: @@ -529,9 +525,9 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_atacumap_*.pdf" doc: | - Log normalized gene expression on cells UMAP - split by selected criteria, optionally subsetted - to the specific group (atacumap dim. reduction). + UMAP, gene expression, split by selected + criteria, optionally subsetted to the + specific group, ATAC. PDF format xpr_per_cell_rd_wnnumap_plot_png: @@ -542,9 +538,9 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_wnnumap_*.png" doc: | - Log normalized gene expression on cells UMAP - split by selected criteria, optionally subsetted - to the specific group (wnnumap dim. reduction). + UMAP, gene expression, split by selected + criteria, optionally subsetted to the + specific group, WNN. PNG format xpr_per_cell_rd_wnnumap_plot_pdf: @@ -555,9 +551,9 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_wnnumap_*.pdf" doc: | - Log normalized gene expression on cells UMAP - split by selected criteria, optionally subsetted - to the specific group (wnnumap dim. reduction). + UMAP, gene expression, split by selected + criteria, optionally subsetted to the + specific group, WNN. PDF format xpr_htmp_plot_png: @@ -565,9 +561,9 @@ outputs: outputBinding: glob: "*_xpr_htmp.png" doc: | - Filtered by adjusted P-value normalized gene - expression heatmap per cell optionally subsetted - to the specific group. + Gene expression heatmap, filtered by adjusted + P-value, optionally subsetted to the specific + groups of cells. 
PNG format xpr_htmp_plot_pdf: @@ -575,9 +571,9 @@ outputs: outputBinding: glob: "*_xpr_htmp.pdf" doc: | - Filtered by adjusted P-value normalized gene - expression heatmap per cell optionally subsetted - to the specific group. + Gene expression heatmap, filtered by adjusted + P-value, optionally subsetted to the specific + groups of cells. PDF format diff_expr_genes: @@ -585,8 +581,8 @@ outputs: outputBinding: glob: "*_de_genes.tsv" doc: | - Differentially expressed genes. Not filtered - by adjusted P-value. + Differentially expressed genes. + Not filtered by adjusted P-value. TSV format bulk_read_counts_gct: @@ -594,8 +590,8 @@ outputs: outputBinding: glob: "*_bulk_counts.gct" doc: | - GSEA compatible not filtered normalized reads - counts aggregated to pseudobulk form. + GSEA compatible not filtered normalized + reads counts aggregated to pseudobulk form. GCT format bulk_phenotypes_cls: diff --git a/workflows/sc-rna-de-pseudobulk.cwl b/workflows/sc-rna-de-pseudobulk.cwl index cceefc7b..8723b34f 100644 --- a/workflows/sc-rna-de-pseudobulk.cwl +++ b/workflows/sc-rna-de-pseudobulk.cwl @@ -29,254 +29,216 @@ inputs: alias: type: string - label: "Experiment short name/alias" + label: "Analysis name" sd:preview: position: 1 query_data_rds: type: File - label: "Single-cell Cluster or Manual Cell Type Assignment Analysis" - doc: | - Single-cell analysis run through the - clustering or cell type assignment - pipelines. + label: "Single-cell Analysis with Clustered RNA-Seq Datasets" + doc: | + Analysis that includes single-cell + multiome RNA and ATAC-Seq or just + RNA-Seq datasets run through either + "Single-cell Manual Cell Type + Assignment", "Single-cell RNA-Seq + Cluster Analysis", or "Single-cell + WNN Cluster Analysis" at any of the + processing stages. 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" 'sd:localLabel': true - datasets_metadata: - type: File? 
- label: "TSV/CSV file to assign categories per sample" - doc: | - If selected single-cell analysis was run - with the data aggregated from multiple - samples, you can optionally provide tab- - delimited or comma-separated file to - assign additional categories per sample. - First column should be named 'library_id' - and include all sample names from the - selected single-cell analysis regardless - whether filtering by barcodes was applied - or not. All other columns may have - arbitrary names. - - barcodes_data: - type: File? - label: "TSV/CSV file to filter cells by barcodes" - doc: | - Loaded single-cell data can be optionally - prefiltered by selected cell barcodes. - Provided tab-delimited or comma-separated - file should have the first column named - 'barcode'. If this file includes any other - columns, they will be used to assign - additional categories per cell. - groupby: type: string? default: null - label: "Category to group cells for optional subsetting" - doc: | - Before running differential expression - analysis input data can be optionally - prefiltered to include only certain - values from the specific category. - Here we define the name of that - category. + label: "Subsetting category (optional)" + doc: | + Single cell metadata column to group + cells for optional subsetting before + running differential expression analysis. + To group cells by dataset, use "dataset". + Custom groups can be defined based on + any single cell metadata added through + the "Datasets metadata (optional)" or + "Selected cell barcodes (optional)" + inputs. Default: do not subset cells subset: type: string? default: null - label: "List of values to subset cells from the selected category" + label: "Subsetting values (optional)" doc: | - If the category to group cells for - optional subsetting was provided, - here we define which values should - be included into analysis. 
+ Comma or space separated list of values + from the single cell metadata column + selected in "Subsetting category + (optional)" input. Ignored if grouping + category is not provided. Default: do + not subset cells splitby: type: string - label: "Category to split cell into two groups" - doc: | - All remaining after optional prefiltering - steps cells will be split into two groups - for gene expression comparison. + label: "Comparison category" + doc: | + Single cell metadata column to split + cells into two comparison groups before + running differential expression analysis. + To split cells by dataset, use "dataset". + Custom groups can be defined based on + any single cell metadata added through + the "Datasets metadata (optional)" or + "Selected cell barcodes (optional)" + inputs. The direction of comparison is + always "Second comparison group" vs + "First comparison group". first_cond: type: string - label: "Value from the selected category to define the first group of cells" + label: "First comparison group" doc: | - Cells for which the selected category - includes provided value will be used - as the first group for differential - expression comparison. Direction of - comparison is second vs first groups. + Value from the single cell metadata + column selected in "Comparison category" + input to define the first group of cells + for differential expression analysis. second_cond: type: string - label: "Value from the selected category to define the second group of cells" + label: "Second comparison group" doc: | - Cells for which the selected category - includes provided value will be used - as the second group for differential - expression comparison. Direction of - comparison is second vs first groups. + Value from the single cell metadata + column selected in "Comparison category" + input to define the second group of cells + for differential expression analysis. 
analysis_method: type: - "null" - type: enum symbols: - - "wilcoxon (by cells, no batches)" # (wilcox) Wilcoxon Rank Sum test - - "likelihood-ratio (by cells, no batches)" # (bimod) Likelihood-ratio test - - "t-test (by cells, no batches)" # (t) Student's t-test - - "negative-binomial (by cells, models batches)" # (negbinom) Negative Binomial Generalized Linear Model (supports --batchby) - - "poisson (by cells, models batches)" # (poisson) Poisson Generalized Linear Model (supports --batchby) - - "logistic-regression (by cells, models batches)" # (LR) Logistic Regression (supports --batchby) - - "mast (by cells, models batches)" # (MAST) MAST package (supports --batchby) - - "deseq (pseudo bulk, models batches)" # DESeq2 Wald test on pseudobulk aggregated gene expression - - "deseq-lrt (pseudo bulk, models batches)" # DESeq2 LRT test on pseudobulk aggregated gene expression - default: wilcoxon - label: "Test type to use in differential expression analysis" - doc: | - Test type to use in the differential - expression analysis. If set to deseq - or deseq-lrt, gene expression will be - aggregated to the pseudobulk form per - sample. Othwerwise, analysis will be - run on the cells level. If deseq is - selected, the pair-wise Wald test will - be used. For deseq-lrt, the Likelihood - Ratio Test will be applied between - design and reduced formulas. The - reduced formula will look like ~1 if - grouping by batches is omitted or will - be set to the category defined as - batches. 
+ - "wilcoxon" # (wilcox) Wilcoxon Rank Sum test + - "likelihood-ratio" # (bimod) Likelihood-ratio test + - "t-test" # (t) Student's t-test + - "negative-binomial (batch correction)" # (negbinom) Negative Binomial Generalized Linear Model (supports --batchby) + - "poisson (batch correction)" # (poisson) Poisson Generalized Linear Model (supports --batchby) + - "logistic-regression (batch correction)" # (LR) Logistic Regression (supports --batchby) + - "mast (batch correction)" # (MAST) MAST package (supports --batchby) + - "deseq (pseudo bulk, batch correction)" # DESeq2 Wald test on pseudobulk aggregated gene expression + - "deseq-lrt (pseudo bulk, batch correction)" # DESeq2 LRT test on pseudobulk aggregated gene expression + default: "wilcoxon" + label: "Statistical test" + doc: | + Statistical test to use in the + differential expression analysis. If + set to "deseq" or "deseq-lrt", gene + expression will be aggregated to the + pseudo bulk form per dataset. Othwerwise, + analysis will be run on the cells level. + If "deseq" is selected, the pair-wise + Wald test will be used. For "deseq-lrt", + the Likelihood Ratio Test will be applied + between design and reduced formulas. The + reduced formula will look like "~1" if + grouping by batches is omitted or will be + set to the category defined in "Batch + correction (if supported)" input. batchby: type: string? default: null - label: "Category to model batch effect" - doc: | - If selected test type supports batch - effect modeling, the provided category - will be used to group cells into - batches. For deseq and deseq-lrt tests - batch modeling will result in adding it - into the design formula. For negative- - binomial, poisson, logistic-regression, - or mast tests grouping by batches will - be used as a latent variable in the - FindMarkers function. - - maximum_padj: - type: float? 
- default: 0.05 - label: "Maximum adjusted P-value for genes displayed on the heatmap" - doc: | - When generating gene expression heatmap - per cell output only differentially - expressed genes with the adjusted P-value - not bigger than this value. - - genes_of_interest: - type: string? - default: null - label: "Genes of interest to be shown on the plots" - doc: | - Genes of interest to be shown on the - volcano, violin, and UMAP plots. - 'sd:layout': - advanced: true + label: "Batch correction (if supported)" + doc: | + Value from the single cell metadata + column to be used for batch effect + modeling if the selected "Statistical + test" supports it (otherwise the + workflow will exit with error). For + "deseq" and "deseq-lrt" tests batch + modeling will result in adding it + into the design formula. For + negative-binomial, poisson, + logistic-regression, or mast tests + grouping by batches will be used as + a latent variable in the "FindMarkers" + function. exclude_pattern: type: string? default: null - label: "Regex pattern to identify and exclude specific genes from the analysis" + label: "Exclude genes" doc: | Regex pattern to identify and exclude specific genes from the differential expression analysis (not case-sensitive). - If any of such genes were selected as - genes of interest to be shown on the plots, - they will be excluded from there as well. - 'sd:layout': - advanced: true + If any of these genes were also provided + in "Genes of interest" input, they will + be excluded from that list as well. - cluster_method: - type: - - "null" - - type: enum - symbols: - - "row" - - "column" - - "both" - - "none" - default: "row" - label: "Clustering method for gene expression data" - doc: | - Clustering method to be run on - the normalized read counts data. - "column" and "both" options are - supported only when using deseq - or desey-lrt tests for which gene - expression data aggregated to the - pseudobulk form. 
- 'sd:layout': - advanced: true + maximum_padj: + type: float? + default: 0.05 + label: "Maximum adjusted P-value" + doc: | + Maximum adjusted P-value threshold for + selecting differentially expressed genes + to be visualized on the heatmap. - row_distance: - type: - - "null" - - type: enum - symbols: - - "cosangle" - - "abscosangle" - - "euclid" - - "abseuclid" - - "cor" - - "abscor" - default: "cosangle" - label: "Distance metric for row clustering" - doc: | - Distance metric for row clustering. - Ignored if clustering method is set - to "column" or "none". - 'sd:layout': - advanced: true + enable_clustering: + type: boolean? + default: false + label: "Cluster gene expression heatmap" + doc: | + Apply hierarchical (HOPACH) clustering + on the normalized read counts for the + exploratory visualization part of the + analysis. If the "Statistical test" + input is set to "deseq" or "deseq-lrt", + clustering will be performed for both + rows (genes) and columns (pseudo bulk + gene expression per dataset). For all + other statistical tests, clustering will + be performed only by rows (genes). + Default: clustering not enabled - column_distance: - type: - - "null" - - type: enum - symbols: - - "cosangle" - - "abscosangle" - - "euclid" - - "abseuclid" - - "cor" - - "abscor" - default: "euclid" - label: "Distance metric for column clustering" - doc: | - Distance metric for column clustering. - Ignored if clustering method is set - to "row" or "none". - 'sd:layout': - advanced: true + genes_of_interest: + type: string? + default: null + label: "Genes of interest" + doc: | + Comma or space separated list of genes + of interest to visualize expression + on the generated volcano, violin, and + UMAP plots. + Default: None - center_row: - type: boolean? - default: true - label: "Gene expression mean centering for clustering by row" - doc: | - Apply mean centering for gene - expression prior to running - clustering by row. 
Ignored if - clustering method is set to - "column" or "none". - 'sd:layout': - advanced: true + datasets_metadata: + type: File? + label: "Datasets metadata (optional)" + doc: | + If the selected single-cell analysis + includes multiple aggregated datasets, + each of them can be assigned to a + separate group by one or multiple + categories. This can be achieved by + providing a TSV/CSV file with + "library_id" as the first column and + any number of additional columns with + unique names, representing the desired + grouping categories. + + barcodes_data: + type: File? + label: "Selected cell barcodes (optional)" + doc: | + A TSV/CSV file to optionally prefilter + the single cell data by including only + the cells with the selected barcodes. + The provided file should include at + least one column named "barcode", with + one cell barcode per line. All other + columns, except for "barcode", will be + added to the single cell metadata loaded + from "Single-cell Analysis with Clustered + RNA-Seq Datasets" and can be utilized in + the current or future steps of analysis. color_theme: type: @@ -292,39 +254,12 @@ inputs: - "classic" - "void" default: "classic" - label: "Color theme" + label: "Plots color theme" doc: | - Color theme for all generated plots. - 'sd:layout': - advanced: true - - parallel_memory_limit: - type: - - "null" - - type: enum - symbols: - - "32" - default: "32" - label: "Maximum shared memory in GB" - doc: | - Maximum memory in GB allowed to - be shared between the workers - when using multiple CPUs. - 'sd:layout': - advanced: true - - vector_memory_limit: - type: - - "null" - - type: enum - symbols: - - "64" - default: "64" - label: "Maximum vector memory in GB" - doc: | - Maximum vector memory in GB - allowed to be used by R. - 'sd:layout': + Color theme for all plots saved + as PNG files. 
+ Default: classic + "sd:layout": advanced: true threads: @@ -333,66 +268,39 @@ inputs: - type: enum symbols: - "1" + - "2" default: "1" - label: "Number of cores/cpus" + label: "Cores/CPUs" doc: | - Number of cores/cpus to use - 'sd:layout': + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 1 + "sd:layout": advanced: true outputs: - umap_rd_rnaumap_plot_png: - type: File? - outputSource: de_pseudobulk/umap_rd_rnaumap_plot_png - label: "Cells RNA UMAP split by selected criteria" - doc: | - Cells UMAP split by selected criteria, - optionally subsetted to the specific - group (rnaumap dim. reduction). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Overall' - Caption: 'Cells RNA UMAP split by selected criteria' - - umap_rd_atacumap_plot_png: - type: File? - outputSource: de_pseudobulk/umap_rd_atacumap_plot_png - label: "Cells ATAC UMAP split by selected criteria" - doc: | - Cells UMAP split by selected criteria, - optionally subsetted to the specific - group (atacumap dim. reduction). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Overall' - Caption: 'Cells ATAC UMAP split by selected criteria' - - umap_rd_wnnumap_plot_png: - type: File? - outputSource: de_pseudobulk/umap_rd_wnnumap_plot_png - label: "Cells WNN UMAP split by selected criteria" - doc: | - Cells UMAP split by selected criteria, - optionally subsetted to the specific - group (wnnumap dim. reduction). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Overall' - Caption: 'Cells WNN UMAP split by selected criteria' - mds_plot_html: type: File? outputSource: de_pseudobulk/mds_plot_html - label: "Interactive MDS Plot" + label: "MDS Plot" doc: | MDS plot of pseudobulk aggregated - normalized reads counts. All genes. 
- HTML format + not filtered normalized reads counts + in HTML format + 'sd:visualPlugins': + - linkList: + tab: 'Overview' + target: "_blank" + + heatmap_html: + type: File + outputSource: morpheus_heatmap/heatmap_html + label: "Heatmap" + doc: | + Morpheus heatmap in HTML format 'sd:visualPlugins': - linkList: tab: 'Overview' @@ -401,7 +309,7 @@ outputs: volcano_plot_html_file: type: File outputSource: make_volcano_plot/html_file - label: "Interactive Volcano Plot" + label: "Volcano Plot" doc: | HTML index file for Volcano Plot 'sd:visualPlugins': @@ -412,78 +320,124 @@ outputs: volcano_plot_html_data: type: Directory outputSource: make_volcano_plot/html_data - label: "Directory html data for Volcano Plot" + label: "Volcano Plot data" doc: | Directory html data for Volcano Plot - heatmap_html: - type: File - outputSource: morpheus_heatmap/heatmap_html - label: "Interactive Gene Expression Heatmap" - doc: | - Morpheus heatmap in HTML format - 'sd:visualPlugins': - - linkList: - tab: 'Overview' - target: "_blank" - pca_1_2_plot_png: type: File? outputSource: de_pseudobulk/pca_1_2_plot_png - label: "Normalized reads counts PCA (1, 2). All genes." + label: "Gene expression PCA (1,2)" doc: | - Normalized reads counts PCA (1, 2). All genes. - PNG format + Gene expression PCA (1,2) + in PNG format 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Normalized reads counts PCA (1, 2). All genes' + tab: 'QC' + Caption: 'Gene expression PCA (1,2)' pca_2_3_plot_png: type: File? outputSource: de_pseudobulk/pca_2_3_plot_png - label: "Normalized reads counts PCA (2, 3). All genes." + label: "Gene expression PCA (2,3)" + doc: | + Gene expression PCA (2,3) + in PNG format + 'sd:visualPlugins': + - image: + tab: 'QC' + Caption: 'Gene expression PCA (2,3)' + + umap_rd_rnaumap_plot_png: + type: File? 
+ outputSource: de_pseudobulk/umap_rd_rnaumap_plot_png + label: "UMAP, split by comparison category, RNA" + doc: | + UMAP, split by the single cell metadata + column defined in the "Comparison category", + optionally subsetted to include only cells + with "Subsetting values (optional)" from + the "Subsetting category (optional)", RNA + PNG format + 'sd:visualPlugins': + - image: + tab: 'QC' + Caption: 'UMAP, split by comparison category, RNA' + + umap_rd_atacumap_plot_png: + type: File? + outputSource: de_pseudobulk/umap_rd_atacumap_plot_png + label: "UMAP, split by comparison category, ATAC" + doc: | + UMAP, split by the single cell metadata + column defined in the "Comparison category", + optionally subsetted to include only cells + with "Subsetting values (optional)" from + the "Subsetting category (optional)", ATAC + PNG format + 'sd:visualPlugins': + - image: + tab: 'QC' + Caption: 'UMAP, split by comparison category, ATAC' + + umap_rd_wnnumap_plot_png: + type: File? + outputSource: de_pseudobulk/umap_rd_wnnumap_plot_png + label: "UMAP, split by comparison category, WNN" doc: | - Normalized reads counts PCA (2, 3). All genes. + UMAP, split by the single cell metadata + column defined in the "Comparison category", + optionally subsetted to include only cells + with "Subsetting values (optional)" from + the "Subsetting category (optional)", WNN PNG format 'sd:visualPlugins': - image: - tab: 'Overall' - Caption: 'Normalized reads counts PCA (2, 3). All genes' + tab: 'QC' + Caption: 'UMAP, split by comparison category, WNN' dxpr_vlcn_plot_png: type: File? outputSource: de_pseudobulk/dxpr_vlcn_plot_png - label: "Volcano plot of differentially expressed genes" - doc: | - Volcano plot of differentially expressed genes. - Highlighed genes are either provided by user or - top 10 genes with the highest log2FoldChange - values. The direction of comparison is defined - as --second vs --first. 
Cells are optionally - subsetted to the specific group and optionally - coerced to the pseudobulk form. - PNG format + label: "Gene expression volcano plot" + doc: | + Volcano plot of differentially expressed + genes. Highlighted genes are either provided + by user or top 10 genes with the highest + log2FoldChange values. PNG format 'sd:visualPlugins': - image: - tab: 'Gene expression' + tab: 'Genes of interest' Caption: 'Volcano plot of differentially expressed genes' xpr_dnst_plot_png: type: File? outputSource: de_pseudobulk/xpr_dnst_plot_png - label: "Log normalized gene expression density plots" + label: "Gene expression violin plot" doc: | - Log normalized gene expression density plots for - either user provided or top 10 differentially - expressed genes with the highest log2FoldChange - values. The direction of comparison is defined - as --second vs --first. - PNG format + Gene expression violin plots for + either user provided or top 10 + differentially expressed genes + with the highest log2FoldChange + values in PNG format + 'sd:visualPlugins': + - image: + tab: 'Genes of interest' + Caption: 'Gene expression violin plot' + + xpr_htmp_plot_png: + type: File? + outputSource: de_pseudobulk/xpr_htmp_plot_png + label: "Gene expression heatmap" + doc: | + Gene expression heatmap, filtered + by adjusted P-value, optionally + subsetted to the specific groups + of cells in PNG format 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression density plots' + tab: 'Heatmap' + Caption: 'Gene expression heatmap' xpr_per_cell_rd_rnaumap_plot_png: type: @@ -491,16 +445,15 @@ outputs: - type: array items: File outputSource: de_pseudobulk/xpr_per_cell_rd_rnaumap_plot_png - label: "Log normalized gene expression on cells RNA UMAP" + label: "UMAP, gene expression, RNA" doc: | - Log normalized gene expression on cells UMAP - split by selected criteria, optionally subsetted - to the specific group (rnaumap dim. reduction).
- PNG format + UMAP, gene expression, split by selected + criteria, optionally subsetted to the + specific group, RNA, PNG format 'sd:visualPlugins': - image: - tab: 'Gene expression RNA' - Caption: 'Log normalized gene expression on cells RNA UMAP' + tab: 'Gene expression, RNA' + Caption: 'UMAP, gene expression, RNA' xpr_per_cell_rd_atacumap_plot_png: type: @@ -508,16 +461,15 @@ outputs: - type: array items: File outputSource: de_pseudobulk/xpr_per_cell_rd_atacumap_plot_png - label: "Log normalized gene expression on cells ATAC UMAP" + label: "UMAP, gene expression, ATAC" doc: | - Log normalized gene expression on cells UMAP - split by selected criteria, optionally subsetted - to the specific group (atacumap dim. reduction). - PNG format + UMAP, gene expression, split by selected + criteria, optionally subsetted to the + specific group, ATAC, PNG format 'sd:visualPlugins': - image: - tab: 'Gene expression ATAC' - Caption: 'Log normalized gene expression on cells ATAC UMAP' + tab: 'Gene expression, ATAC' + Caption: 'UMAP, gene expression, ATAC' xpr_per_cell_rd_wnnumap_plot_png: type: @@ -525,69 +477,53 @@ outputs: - type: array items: File outputSource: de_pseudobulk/xpr_per_cell_rd_wnnumap_plot_png - label: "Log normalized gene expression on cells WNN UMAP" - doc: | - Log normalized gene expression on cells UMAP - split by selected criteria, optionally subsetted - to the specific group (wnnumap dim. reduction). - PNG format - 'sd:visualPlugins': - - image: - tab: 'Gene expression WNN' - Caption: 'Log normalized gene expression on cells WNN UMAP' - - xpr_htmp_plot_png: - type: File? - outputSource: de_pseudobulk/xpr_htmp_plot_png - label: "Filtered by adjusted P-value normalized gene expression heatmap" + label: "UMAP, gene expression, WNN" doc: | - Filtered by adjusted P-value normalized gene - expression heatmap per cell optionally subsetted - to the specific group. 
- PNG format + UMAP, gene expression, split by selected + criteria, optionally subsetted to the + specific group, WNN, PNG format 'sd:visualPlugins': - image: - tab: 'Gene expression' - Caption: 'Filtered by adjusted P-value normalized gene expression heatmap' + tab: 'Gene expression, WNN' + Caption: 'UMAP, gene expression, WNN' diff_expr_genes: type: File outputSource: de_pseudobulk/diff_expr_genes - label: "Differentially expressed genes. Not filtered" + label: "Differentially expressed genes" doc: | - Differentially expressed genes. Not filtered - by adjusted P-value. + Not filtered by adjusted P-value + differentially expressed genes in TSV format 'sd:visualPlugins': - syncfusiongrid: - tab: 'Diff expressed genes' - Title: 'Differentially expressed genes. Not filtered' + tab: 'Diff. expressed genes' + Title: 'Differentially expressed genes' read_counts_file: type: File? outputSource: de_pseudobulk/bulk_read_counts_gct - label: "GSEA compatible not filtered normalized reads counts" + label: "GSEA compatible reads counts" doc: | - GSEA compatible not filtered normalized reads - counts aggregated to pseudobulk form. - GCT format + GSEA compatible not filtered normalized + reads counts aggregated to pseudobulk + form in GCT format phenotypes_file: type: File? outputSource: de_pseudobulk/bulk_phenotypes_cls - label: "GSEA compatible phenotypes file" + label: "GSEA compatible phenotypes" doc: | - GSEA compatible phenotypes file defined based - on --splitby, --first, and --second parameters. - CLS format + GSEA compatible phenotypes file + in CLS format cell_read_counts_gct: type: File outputSource: de_pseudobulk/cell_read_counts_gct - label: "Filtered normalized reads counts per cell" + label: "Morpheus compatible reads counts" doc: | - Filtered normalized reads counts per cell. 
- GCT format + Filtered normalized reads counts + per cell in GCT format pdf_plots: type: File @@ -634,11 +570,29 @@ steps: barcodes_data: barcodes_data groupby: source: groupby - valueFrom: $(self==""?null:self) # safety measure + valueFrom: | + ${ + if (self == "dataset") { + return "new.ident"; + } else if (self == "") { + return null; + } else { + return self; + } + } subset: source: subset valueFrom: $(split_features(self)) - splitby: splitby + splitby: + source: splitby + valueFrom: | + ${ + if (self == "dataset") { + return "new.ident"; + } else { + return self; + } + } first_cond: first_cond second_cond: second_cond analysis_method: @@ -655,22 +609,36 @@ steps: source: exclude_pattern valueFrom: $(self==""?null:self) # safety measure cluster_method: - source: cluster_method - valueFrom: $(self=="none"?null:self) - row_distance: row_distance - column_distance: column_distance - center_row: center_row + source: + - enable_clustering + - analysis_method + valueFrom: | + ${ + if (self[0]) { + if (self[1].includes("deseq")) { + return "both"; + } else { + return "row"; + } + } else { + return null; + } + } + row_distance: + default: "cosangle" + column_distance: + default: "euclid" + center_row: + default: true export_pdf_plots: default: true color_theme: color_theme verbose: default: true parallel_memory_limit: - source: parallel_memory_limit - valueFrom: $(parseInt(self)) + default: 32 vector_memory_limit: - source: vector_memory_limit - valueFrom: $(parseInt(self)) + default: 96 threads: source: threads valueFrom: $(parseInt(self)) @@ -678,7 +646,6 @@ steps: - umap_rd_rnaumap_plot_png - umap_rd_atacumap_plot_png - umap_rd_wnnumap_plot_png - - mds_plot_html - pca_1_2_plot_png - pca_2_3_plot_png - dxpr_vlcn_plot_png @@ -766,9 +733,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell Pseudobulk Differential Expression Analysis Between Datasets" -s:name: "Single-cell 
Pseudobulk Differential Expression Analysis Between Datasets" -s:alternateName: "Identifies differentially expressed genes between groups of cells coerced to pseudobulk datasets" +label: "Single-cell RNA-Seq Differential Expression Analysis" +s:name: "Single-cell RNA-Seq Differential Expression Analysis" +s:alternateName: "Identifies differentially expressed genes between groups of cells optionally coerced to the pseudobulk form" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-de-pseudobulk.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -806,7 +773,8 @@ s:creator: doc: | - Single-cell Pseudobulk Differential Expression Analysis Between Datasets + Single-cell RNA-Seq Differential Expression Analysis - Identifies differentially expressed genes between groups of cells - coerced to pseudobulk datasets. \ No newline at end of file + Identifies differentially expressed genes + between groups of cells optionally coerced + to the pseudobulk form. 
\ No newline at end of file From 2807ad3623b7c3c57e8be2f4c461cca3d737c903 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 4 Aug 2023 14:30:21 -0400 Subject: [PATCH 064/162] Add sc rna trajectory analysis --- tools/sc-atac-cluster.cwl | 2 +- tools/sc-atac-coverage.cwl | 2 +- tools/sc-atac-dbinding.cwl | 2 +- tools/sc-atac-reduce.cwl | 2 +- tools/sc-ctype-assign.cwl | 2 +- tools/sc-multiome-filter.cwl | 2 +- tools/sc-rna-cluster.cwl | 2 +- tools/sc-rna-da-cells.cwl | 2 +- tools/sc-rna-de-pseudobulk.cwl | 2 +- tools/sc-rna-filter.cwl | 2 +- tools/sc-rna-reduce.cwl | 2 +- tools/sc-rna-trajectory.cwl | 630 ++++++++++++++++++++++++++++++++ tools/sc-triangulate.cwl | 2 +- tools/sc-wnn-cluster.cwl | 2 +- workflows/sc-rna-trajectory.cwl | 532 +++++++++++++++++++++++++++ 15 files changed, 1175 insertions(+), 13 deletions(-) create mode 100644 tools/sc-rna-trajectory.cwl create mode 100644 workflows/sc-rna-trajectory.cwl diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index 180c28ff..3055ee63 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.26 + dockerPull: biowardrobe2/sc-tools:v0.0.27 inputs: diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index 42a919f9..e18dd718 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.26 + dockerPull: biowardrobe2/sc-tools:v0.0.27 inputs: diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index 7698b2c0..fc4281e7 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.26 + dockerPull: biowardrobe2/sc-tools:v0.0.27 inputs: diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index 
144c8e8a..6b80e04b 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.26 + dockerPull: biowardrobe2/sc-tools:v0.0.27 inputs: diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index acda4c3b..845a6d46 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.26 + dockerPull: biowardrobe2/sc-tools:v0.0.27 inputs: diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index 98926a6b..7b439a78 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.26 + dockerPull: biowardrobe2/sc-tools:v0.0.27 inputs: diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index 19ccc39f..07acc149 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.26 + dockerPull: biowardrobe2/sc-tools:v0.0.27 inputs: diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index f543902e..23203919 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.26 + dockerPull: biowardrobe2/sc-tools:v0.0.27 inputs: diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index df4f9936..053247e2 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.26 + dockerPull: biowardrobe2/sc-tools:v0.0.27 inputs: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index 37a04614..adc0eaf4 100644 
--- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.26 + dockerPull: biowardrobe2/sc-tools:v0.0.27 inputs: diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index ec10de08..d19478e9 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.26 + dockerPull: biowardrobe2/sc-tools:v0.0.27 inputs: diff --git a/tools/sc-rna-trajectory.cwl b/tools/sc-rna-trajectory.cwl new file mode 100644 index 00000000..691c375b --- /dev/null +++ b/tools/sc-rna-trajectory.cwl @@ -0,0 +1,630 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: EnvVarRequirement + envDef: + R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-tools:v0.0.27 + + +inputs: + + query_data_rds: + type: File + inputBinding: + prefix: "--query" + doc: | + Path to the RDS file to load Seurat object from. This file should + include genes expression information stored in the RNA assay and + dimensionality reduction specified in the --reduction parameter. + + reduction: + type: string? + inputBinding: + prefix: "--reduction" + doc: | + Dimensionality reduction to be used in the trajectory analysis. + Default: pca + + dimensions: + type: + - "null" + - int + - int[] + inputBinding: + prefix: "--dimensions" + doc: | + Dimensionality to use (from 1 to 50). If single value N is provided, + use from 1 to N dimensions. If multiple values are provided, subset + to only selected dimensions. May fail if user specified more dimensions + than it was available in the selected --reduction. 
+ Default: use all available dimensions + + query_source_column: + type: string + inputBinding: + prefix: "--source" + doc: | + Column from the metadata of the loaded + Seurat object to select clusters from + + barcodes_data: + type: File? + inputBinding: + prefix: "--barcodes" + doc: | + Path to the TSV/CSV file to optionally prefilter and extend Seurat object + metadata by selected barcodes. First column should be named as 'barcode'. + If file includes any other columns they will be added to the Seurat object + metadata overwriting the existing ones if those are present. + Default: all cells used, no extra metadata is added + + trajectory_start: + type: string? + inputBinding: + prefix: "--start" + doc: | + Value from the metadata column defined with --source + parameter to set the starting point for the trajectory. + Default: defined automatically + + predictive_genes: + type: int? + inputBinding: + prefix: "--ngenes" + doc: | + Number of the most predictive genes to be shown + on the gene expression heatmap. Default: 50 + + export_pdf_plots: + type: boolean? + inputBinding: + prefix: "--pdf" + doc: | + Export plots in PDF. + Default: false + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + inputBinding: + prefix: "--theme" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + + verbose: + type: boolean? + inputBinding: + prefix: "--verbose" + doc: | + Print debug information. + Default: false + + export_h5seurat_data: + type: boolean? + inputBinding: + prefix: "--h5seurat" + doc: | + Save Seurat data to h5seurat file. + Default: false + + export_h5ad_data: + type: boolean? + inputBinding: + prefix: "--h5ad" + doc: | + Save Seurat data to h5ad file. + Default: false + + export_ucsc_cb: + type: boolean?
+ inputBinding: + prefix: "--cbbuild" + doc: | + Export results to UCSC Cell Browser. Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix. + Default: ./sc + + parallel_memory_limit: + type: int? + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Default: 32 + + vector_memory_limit: + type: int? + default: 128 + doc: | + Maximum vector memory in GB allowed to be used by R. + Default: 128 + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. + Default: 1 + + +outputs: + + trjc_gr_clst_plot_png: + type: File? + outputBinding: + glob: "*_trjc_gr_clst.png" + doc: | + Trajectory plot, colored by cluster. + PNG format + + trjc_gr_clst_plot_pdf: + type: File? + outputBinding: + glob: "*_trjc_gr_clst.pdf" + doc: | + Trajectory plot, colored by cluster. + PDF format + + trjc_pstm_plot_png: + type: File? + outputBinding: + glob: "*_trjc_pstm.png" + doc: | + Trajectory plot, colored by pseudotime. + PNG format + + trjc_pstm_plot_pdf: + type: File? + outputBinding: + glob: "*_trjc_pstm.pdf" + doc: | + Trajectory plot, colored by pseudotime. + PDF format + + grph_gr_clst_plot_png: + type: File? + outputBinding: + glob: "*_grph_gr_clst.png" + doc: | + Trajectory graph, colored by cluster. + PNG format + + grph_gr_clst_plot_pdf: + type: File? + outputBinding: + glob: "*_grph_gr_clst.pdf" + doc: | + Trajectory graph, colored by cluster. + PDF format + + grph_pstm_plot_png: + type: File? + outputBinding: + glob: "*_grph_pstm.png" + doc: | + Trajectory graph, colored by pseudotime. + PNG format + + grph_pstm_plot_pdf: + type: File? + outputBinding: + glob: "*_grph_pstm.pdf" + doc: | + Trajectory graph, colored by pseudotime. + PDF format + + dndr_gr_clst_plot_png: + type: File? + outputBinding: + glob: "*_dndr_gr_clst.png" + doc: | + Dendrogram plot, colored by cluster. 
+ PNG format + + dndr_gr_clst_plot_pdf: + type: File? + outputBinding: + glob: "*_dndr_gr_clst.pdf" + doc: | + Dendrogram plot, colored by cluster. + PDF format + + dndr_pstm_plot_png: + type: File? + outputBinding: + glob: "*_dndr_pstm.png" + doc: | + Dendrogram plot, colored by pseudotime. + PNG format + + dndr_pstm_plot_pdf: + type: File? + outputBinding: + glob: "*_dndr_pstm.pdf" + doc: | + Dendrogram plot, colored by pseudotime. + PDF format + + tplg_plot_png: + type: File? + outputBinding: + glob: "*_tplg.png" + doc: | + Topology plot. + PNG format + + tplg_plot_pdf: + type: File? + outputBinding: + glob: "*_tplg.pdf" + doc: | + Topology plot. + PDF format + + xpr_htmp_plot_png: + type: File? + outputBinding: + glob: "*_xpr_htmp.png" + doc: | + Gene expression heatmap. + PNG format + + xpr_htmp_plot_pdf: + type: File? + outputBinding: + glob: "*_xpr_htmp.pdf" + doc: | + Gene expression heatmap. + PDF format + + umap_rd_rnaumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_rd_rnaumap.png" + doc: | + UMAP, colored by pseudotime, RNA. + PNG format + + umap_rd_rnaumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_rd_rnaumap.pdf" + doc: | + UMAP, colored by pseudotime, RNA. + PDF format + + umap_rd_atacumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_rd_atacumap.png" + doc: | + UMAP, colored by pseudotime, ATAC. + PNG format + + umap_rd_atacumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_rd_atacumap.pdf" + doc: | + UMAP, colored by pseudotime, ATAC. + PDF format + + umap_rd_wnnumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_rd_wnnumap.png" + doc: | + UMAP, colored by pseudotime, WNN. + PNG format + + umap_rd_wnnumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_rd_wnnumap.pdf" + doc: | + UMAP, colored by pseudotime, WNN. + PDF format + + umap_spl_idnt_rd_rnaumap_plot_png: + type: File? 
+ outputBinding: + glob: "*_umap_spl_idnt_rd_rnaumap.png" + doc: | + UMAP, colored by pseudotime, + split by dataset, RNA. + PNG format + + umap_spl_idnt_rd_rnaumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_idnt_rd_rnaumap.pdf" + doc: | + UMAP, colored by pseudotime, + split by dataset, RNA. + PDF format + + umap_spl_idnt_rd_atacumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_idnt_rd_atacumap.png" + doc: | + UMAP, colored by pseudotime, + split by dataset, ATAC. + PNG format + + umap_spl_idnt_rd_atacumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_idnt_rd_atacumap.pdf" + doc: | + UMAP, colored by pseudotime, + split by dataset, ATAC. + PDF format + + umap_spl_idnt_rd_wnnumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_idnt_rd_wnnumap.png" + doc: | + UMAP, colored by pseudotime, + split by dataset, WNN. + PNG format + + umap_spl_idnt_rd_wnnumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_idnt_rd_wnnumap.pdf" + doc: | + UMAP, colored by pseudotime, + split by dataset, WNN. + PDF format + + umap_spl_cnd_rd_rnaumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_cnd_rd_rnaumap.png" + doc: | + UMAP, colored by pseudotime, + split by grouping condition, RNA. + PNG format + + umap_spl_cnd_rd_rnaumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_cnd_rd_rnaumap.pdf" + doc: | + UMAP, colored by pseudotime, + split by grouping condition, RNA. + PDF format + + umap_spl_cnd_rd_atacumap_plot_png: + type: File? + outputBinding: + glob: "*_umap_spl_cnd_rd_atacumap.png" + doc: | + UMAP, colored by pseudotime, + split by grouping condition, ATAC. + PNG format + + umap_spl_cnd_rd_atacumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_cnd_rd_atacumap.pdf" + doc: | + UMAP, colored by pseudotime, + split by grouping condition, ATAC. + PDF format + + umap_spl_cnd_rd_wnnumap_plot_png: + type: File? 
+ outputBinding: + glob: "*_umap_spl_cnd_rd_wnnumap.png" + doc: | + UMAP, colored by pseudotime, + split by grouping condition, WNN. + PNG format + + umap_spl_cnd_rd_wnnumap_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_spl_cnd_rd_wnnumap.pdf" + doc: | + UMAP, colored by pseudotime, + split by grouping condition, WNN. + PDF format + + ucsc_cb_config_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser" + doc: | + Directory with UCSC Cellbrowser + configuration data. + + ucsc_cb_html_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser/html_data" + doc: | + Directory with UCSC Cellbrowser + html data. + + ucsc_cb_html_file: + type: File? + outputBinding: + glob: "*_cellbrowser/html_data/index.html" + doc: | + HTML index file from the directory + with UCSC Cellbrowser html data. + + seurat_data_rds: + type: File + outputBinding: + glob: "*_data.rds" + doc: | + Reduced Seurat data in RDS format + + seurat_data_h5seurat: + type: File? + outputBinding: + glob: "*_data.h5seurat" + doc: | + Reduced Seurat data in h5seurat format + + seurat_data_h5ad: + type: File? 
+ outputBinding: + glob: "*_data.h5ad" + doc: | + Reduced Seurat data in h5ad format + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["sc_rna_trajectory.R"] + +stdout: sc_rna_trajectory_stdout.log +stderr: sc_rna_trajectory_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-cell RNA-Seq Trajectory Analysis" +s:name: "Single-cell RNA-Seq Trajectory Analysis" +s:alternateName: "Aligns cells along the trajectory defined based on PCA or other dimensionality reduction" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-rna-trajectory.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell RNA-Seq Trajectory Analysis + + Aligns cells along the trajectory defined + based on PCA or other dimensionality reduction + + +s:about: | + usage: sc_rna_trajectory.R [-h] --query QUERY + [--reduction REDUCTION] + [--dimensions [DIMENSIONS [DIMENSIONS 
...]]] + --source SOURCE + [--barcodes BARCODES] + [--start START] [--ngenes NGENES] + [--pdf] [--verbose] [--h5seurat] + [--h5ad] [--cbbuild] + [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + + Single-cell RNA-Seq Trajectory Analysis + + optional arguments: + -h, --help show this help message and exit + --query QUERY Path to the RDS file to load Seurat object from. This + file should include genes expression information + stored in the RNA assay and dimensionality reduction + specified in the --reduction parameter. + --reduction REDUCTION + Dimensionality reduction to be used in the trajectory + analysis. Default: pca + --dimensions [DIMENSIONS [DIMENSIONS ...]] + Dimensionality to use (from 1 to 50). If single value + N is provided, use from 1 to N dimensions. If multiple + values are provided, subset to only selected + dimensions. May fail if user specified more dimensions + than it was available in the selected --reduction. + Default: use all available dimensions + --source SOURCE Column from the metadata of the loaded Seurat object + to select clusters from + --barcodes BARCODES Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata be selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata ovewriting the existing ones if + those are present. Default: all cells used, no extra + metadata is added + --start START Value from the metadata column defined with --source + parameter to set the starting point for the + trajectory. Default: defined automatically + --ngenes NGENES Number of the most predictive genes to be shows on the + gene expression heatmap. Default: 50 + --pdf Export plots in PDF. Default: false + --verbose Print debug information. Default: false + --h5seurat Save Seurat data to h5seurat file. Default: false + --h5ad Save Seurat data to h5ad file. 
Default: false + --cbbuild Export results to UCSC Cell Browser. Default: false + --output OUTPUT Output prefix. Default: ./sc + --theme {gray,bw,linedraw,light,dark,minimal,classic,void} + Color theme for all generated plots. Default: classic + --cpus CPUS Number of cores/cpus to use. Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple --cpus. Default: 32 \ No newline at end of file diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index ef50b511..8e58dfde 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.26 + dockerPull: biowardrobe2/sc-tools:v0.0.27 inputs: diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index 6ce53030..2d1f48c6 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.26 + dockerPull: biowardrobe2/sc-tools:v0.0.27 inputs: diff --git a/workflows/sc-rna-trajectory.cwl b/workflows/sc-rna-trajectory.cwl new file mode 100644 index 00000000..5af30827 --- /dev/null +++ b/workflows/sc-rna-trajectory.cwl @@ -0,0 +1,532 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + + +'sd:upstream': + sc_tools_sample: + - "sc-rna-cluster.cwl" + - "sc-atac-cluster.cwl" + - "sc-wnn-cluster.cwl" + - "sc-ctype-assign.cwl" + - "https://github.com/datirium/workflows/workflows/sc-rna-cluster.cwl" + - "https://github.com/datirium/workflows/workflows/sc-atac-cluster.cwl" + - "https://github.com/datirium/workflows/workflows/sc-wnn-cluster.cwl" + - "https://github.com/datirium/workflows/workflows/sc-ctype-assign.cwl" + + +inputs: + + alias: + type: string + label: "Analysis 
name" + sd:preview: + position: 1 + + query_data_rds: + type: File + label: "Single-cell Analysis with Clustered RNA-Seq Datasets" + doc: | + Analysis that includes single-cell + multiome RNA and ATAC-Seq or just + RNA-Seq datasets run through either + "Single-cell Manual Cell Type + Assignment" (based on the RNA or WNN + clustering results), "Single-cell + RNA-Seq Cluster Analysis", or + "Single-cell WNN Cluster Analysis" + at any of the processing stages. + 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" + 'sd:localLabel': true + + query_source_column: + type: string + label: "Cells grouping" + doc: | + Single cell metadata column to group + cells into clusters. Usually, in a form + of "[rna|atac|wsnn]_res.X", where X is + the clustering resolution. If cell types + are available, add "custom_" prefix to + the column name. + + trajectory_start: + type: string? + label: "Trajectory start (optional)" + doc: | + Value from the single cell metadata + column used for grouping cells into + the clusters. This value will define + the trajectory starting point. + + barcodes_data: + type: File? + label: "Selected cell barcodes (optional)" + doc: | + A TSV/CSV file to optionally prefilter + the single cell data by including only + the cells with the selected barcodes. + The provided file should include at + least one column named "barcode", with + one cell barcode per line. All other + columns, except for "barcode", will be + added to the single cell metadata loaded + from "Single-cell Analysis with Clustered + RNA-Seq Datasets" and can be utilized in + the current or future steps of analysis. + + export_ucsc_cb: + type: boolean? 
+ default: false + label: "Show results in UCSC Cell Browser" + doc: | + Export results into UCSC Cell Browser + Default: false + 'sd:layout': + advanced: true + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + default: "classic" + label: "Plots color theme" + doc: | + Color theme for all plots saved + as PNG files. + Default: classic + "sd:layout": + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + - "2" + default: "1" + label: "Cores/CPUs" + doc: | + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 1 + "sd:layout": + advanced: true + + +outputs: + + trjc_gr_clst_plot_png: + type: File? + outputSource: rna_trajectory/trjc_gr_clst_plot_png + label: "Trajectory plot, colored by cluster" + doc: | + Trajectory plot, colored by cluster + 'sd:visualPlugins': + - image: + tab: 'Trajectory' + Caption: 'Trajectory plot, colored by cluster' + + trjc_pstm_plot_png: + type: File? + outputSource: rna_trajectory/trjc_pstm_plot_png + label: "Trajectory plot, colored by pseudotime" + doc: | + Trajectory plot, colored by pseudotime + 'sd:visualPlugins': + - image: + tab: 'Trajectory' + Caption: 'Trajectory plot, colored by pseudotime' + + dndr_gr_clst_plot_png: + type: File? + outputSource: rna_trajectory/dndr_gr_clst_plot_png + label: "Dendrogram plot, colored by cluster" + doc: | + Dendrogram plot, colored by cluster + 'sd:visualPlugins': + - image: + tab: 'Trajectory' + Caption: 'Dendrogram plot, colored by cluster' + + dndr_pstm_plot_png: + type: File? + outputSource: rna_trajectory/dndr_pstm_plot_png + label: "Dendrogram plot, colored by pseudotime" + doc: | + Dendrogram plot, colored by pseudotime + 'sd:visualPlugins': + - image: + tab: 'Trajectory' + Caption: 'Dendrogram plot, colored by pseudotime' + + grph_gr_clst_plot_png: + type: File? 
+ outputSource: rna_trajectory/grph_gr_clst_plot_png + label: "Trajectory graph, colored by cluster" + doc: | + Trajectory graph, colored by cluster + 'sd:visualPlugins': + - image: + tab: 'Topology' + Caption: 'Trajectory graph, colored by cluster' + + grph_pstm_plot_png: + type: File? + outputSource: rna_trajectory/grph_pstm_plot_png + label: "Trajectory graph, colored by pseudotime" + doc: | + Trajectory graph, colored by pseudotime + 'sd:visualPlugins': + - image: + tab: 'Topology' + Caption: 'Trajectory graph, colored by pseudotime' + + tplg_plot_png: + type: File? + outputSource: rna_trajectory/tplg_plot_png + label: "Topology plot" + doc: | + Topology plot + 'sd:visualPlugins': + - image: + tab: 'Topology' + Caption: 'Topology plot' + + xpr_htmp_plot_png: + type: File? + outputSource: rna_trajectory/xpr_htmp_plot_png + label: "Gene expression heatmap" + doc: | + Gene expression heatmap + 'sd:visualPlugins': + - image: + tab: 'Heatmap' + Caption: 'Gene expression heatmap' + + umap_rd_rnaumap_plot_png: + type: File? + outputSource: rna_trajectory/umap_rd_rnaumap_plot_png + label: "UMAP, colored by pseudotime, RNA" + doc: | + UMAP, colored by pseudotime, RNA + 'sd:visualPlugins': + - image: + tab: 'Pseudotime' + Caption: 'UMAP, colored by pseudotime, RNA' + + umap_rd_atacumap_plot_png: + type: File? + outputSource: rna_trajectory/umap_rd_atacumap_plot_png + label: "UMAP, colored by pseudotime, ATAC" + doc: | + UMAP, colored by pseudotime, ATAC + 'sd:visualPlugins': + - image: + tab: 'Pseudotime' + Caption: 'UMAP, colored by pseudotime, ATAC' + + umap_rd_wnnumap_plot_png: + type: File? + outputSource: rna_trajectory/umap_rd_wnnumap_plot_png + label: "UMAP, colored by pseudotime, WNN" + doc: | + UMAP, colored by pseudotime, WNN + 'sd:visualPlugins': + - image: + tab: 'Pseudotime' + Caption: 'UMAP, colored by pseudotime, WNN' + + umap_spl_idnt_rd_rnaumap_plot_png: + type: File? 
+ outputSource: rna_trajectory/umap_spl_idnt_rd_rnaumap_plot_png + label: "UMAP, colored by pseudotime, split by dataset, RNA" + doc: | + UMAP, colored by pseudotime, + split by dataset, RNA + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'UMAP, colored by pseudotime, split by dataset, RNA' + + umap_spl_idnt_rd_atacumap_plot_png: + type: File? + outputSource: rna_trajectory/umap_spl_idnt_rd_atacumap_plot_png + label: "UMAP, colored by pseudotime, split by dataset, ATAC" + doc: | + UMAP, colored by pseudotime, + split by dataset, ATAC + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'UMAP, colored by pseudotime, split by dataset, ATAC' + + umap_spl_idnt_rd_wnnumap_plot_png: + type: File? + outputSource: rna_trajectory/umap_spl_idnt_rd_wnnumap_plot_png + label: "UMAP, colored by pseudotime, split by dataset, WNN" + doc: | + UMAP, colored by pseudotime, + split by dataset, WNN + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'UMAP, colored by pseudotime, split by dataset, WNN' + + umap_spl_cnd_rd_rnaumap_plot_png: + type: File? + outputSource: rna_trajectory/umap_spl_cnd_rd_rnaumap_plot_png + label: "UMAP, colored by pseudotime, split by grouping condition, RNA" + doc: | + UMAP, colored by pseudotime, + split by grouping condition, RNA + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'UMAP, colored by pseudotime, split by grouping condition, RNA' + + umap_spl_cnd_rd_atacumap_plot_png: + type: File? + outputSource: rna_trajectory/umap_spl_cnd_rd_atacumap_plot_png + label: "UMAP, colored by pseudotime, split by grouping condition, ATAC" + doc: | + UMAP, colored by pseudotime, + split by grouping condition, ATAC + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'UMAP, colored by pseudotime, split by grouping condition, ATAC' + + umap_spl_cnd_rd_wnnumap_plot_png: + type: File? 
+ outputSource: rna_trajectory/umap_spl_cnd_rd_wnnumap_plot_png + label: "UMAP, colored by pseudotime, split by grouping condition, WNN" + doc: | + UMAP, colored by pseudotime, + split by grouping condition, WNN + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'UMAP, colored by pseudotime, split by grouping condition, WNN' + + ucsc_cb_html_data: + type: Directory? + outputSource: rna_trajectory/ucsc_cb_html_data + label: "UCSC Cell Browser data" + doc: | + Directory with UCSC Cell Browser + data + + ucsc_cb_html_file: + type: File? + outputSource: rna_trajectory/ucsc_cb_html_file + label: "UCSC Cell Browser" + doc: | + UCSC Cell Browser HTML index file + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + + seurat_data_rds: + type: File + outputSource: rna_trajectory/seurat_data_rds + label: "Processed Seurat data in RDS format" + doc: | + Processed Seurat data in RDS format + + pdf_plots: + type: File + outputSource: compress_pdf_plots/compressed_folder + label: "Plots in PDF format" + doc: | + Compressed folder with plots + in PDF format + + rna_trajectory_stdout_log: + type: File + outputSource: rna_trajectory/stdout_log + label: "stdout log generated by rna_trajectory step" + doc: | + stdout log generated by rna_trajectory step + + rna_trajectory_stderr_log: + type: File + outputSource: rna_trajectory/stderr_log + label: "stderr log generated by rna_trajectory step" + doc: | + stderr log generated by rna_trajectory step + + +steps: + + rna_trajectory: + run: ../tools/sc-rna-trajectory.cwl + in: + query_data_rds: query_data_rds + barcodes_data: barcodes_data + reduction: + default: "pca" + query_source_column: query_source_column + trajectory_start: + source: trajectory_start + valueFrom: $(self==""?null:self) # safety measure + predictive_genes: + default: 100 + verbose: + default: true + export_ucsc_cb: export_ucsc_cb + export_pdf_plots: + default: true + color_theme: color_theme + parallel_memory_limit: + default: 32 + 
vector_memory_limit: + default: 96 + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - trjc_gr_clst_plot_png + - trjc_gr_clst_plot_pdf + - trjc_pstm_plot_png + - trjc_pstm_plot_pdf + - grph_gr_clst_plot_png + - grph_gr_clst_plot_pdf + - grph_pstm_plot_png + - grph_pstm_plot_pdf + - dndr_gr_clst_plot_png + - dndr_gr_clst_plot_pdf + - dndr_pstm_plot_png + - dndr_pstm_plot_pdf + - tplg_plot_png + - tplg_plot_pdf + - xpr_htmp_plot_png + - xpr_htmp_plot_pdf + - umap_rd_rnaumap_plot_png + - umap_rd_rnaumap_plot_pdf + - umap_rd_atacumap_plot_png + - umap_rd_atacumap_plot_pdf + - umap_rd_wnnumap_plot_png + - umap_rd_wnnumap_plot_pdf + - umap_spl_idnt_rd_rnaumap_plot_png + - umap_spl_idnt_rd_rnaumap_plot_pdf + - umap_spl_idnt_rd_atacumap_plot_png + - umap_spl_idnt_rd_atacumap_plot_pdf + - umap_spl_idnt_rd_wnnumap_plot_png + - umap_spl_idnt_rd_wnnumap_plot_pdf + - umap_spl_cnd_rd_rnaumap_plot_png + - umap_spl_cnd_rd_rnaumap_plot_pdf + - umap_spl_cnd_rd_atacumap_plot_png + - umap_spl_cnd_rd_atacumap_plot_pdf + - umap_spl_cnd_rd_wnnumap_plot_png + - umap_spl_cnd_rd_wnnumap_plot_pdf + - ucsc_cb_html_data + - ucsc_cb_html_file + - seurat_data_rds + - stdout_log + - stderr_log + + pdf_plots: + run: ../tools/files-to-folder.cwl + in: + input_files: + source: + - rna_trajectory/trjc_gr_clst_plot_pdf + - rna_trajectory/trjc_pstm_plot_pdf + - rna_trajectory/grph_gr_clst_plot_pdf + - rna_trajectory/grph_pstm_plot_pdf + - rna_trajectory/dndr_gr_clst_plot_pdf + - rna_trajectory/dndr_pstm_plot_pdf + - rna_trajectory/tplg_plot_pdf + - rna_trajectory/xpr_htmp_plot_pdf + - rna_trajectory/umap_rd_rnaumap_plot_pdf + - rna_trajectory/umap_rd_atacumap_plot_pdf + - rna_trajectory/umap_rd_wnnumap_plot_pdf + - rna_trajectory/umap_spl_idnt_rd_rnaumap_plot_pdf + - rna_trajectory/umap_spl_idnt_rd_atacumap_plot_pdf + - rna_trajectory/umap_spl_idnt_rd_wnnumap_plot_pdf + - rna_trajectory/umap_spl_cnd_rd_rnaumap_plot_pdf + - rna_trajectory/umap_spl_cnd_rd_atacumap_plot_pdf + - 
rna_trajectory/umap_spl_cnd_rd_wnnumap_plot_pdf + valueFrom: $(self.flat().filter(n => n)) + folder_basename: + default: "pdf_plots" + out: + - folder + + compress_pdf_plots: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: pdf_plots/folder + out: + - compressed_folder + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-cell RNA-Seq Trajectory Analysis" +s:name: "Single-cell RNA-Seq Trajectory Analysis" +s:alternateName: "Aligns cells along the trajectory defined based on PCA or other dimensionality reduction" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-trajectory.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell RNA-Seq Trajectory Analysis + + Aligns cells along the trajectory defined based + on PCA or other dimensionality reduction \ No newline at end of file From 8faeb368b9515d76ea6b04607dbbc431f7fca042 Mon Sep 17 00:00:00 2001 
From: Michael Kotliar Date: Thu, 17 Aug 2023 17:34:14 -0400 Subject: [PATCH 065/162] Add sc vdj profile workflow, update docker image to the latest --- tools/sc-atac-cluster.cwl | 2 +- tools/sc-atac-coverage.cwl | 2 +- tools/sc-atac-dbinding.cwl | 2 +- tools/sc-atac-reduce.cwl | 2 +- tools/sc-ctype-assign.cwl | 2 +- tools/sc-multiome-filter.cwl | 2 +- tools/sc-rna-cluster.cwl | 2 +- tools/sc-rna-da-cells.cwl | 2 +- tools/sc-rna-de-pseudobulk.cwl | 2 +- tools/sc-rna-filter.cwl | 2 +- tools/sc-rna-reduce.cwl | 2 +- tools/sc-rna-trajectory.cwl | 2 +- tools/sc-triangulate.cwl | 2 +- tools/sc-vdj-profile.cwl | 823 +++++++++++++++++++++++++++++++++ tools/sc-wnn-cluster.cwl | 2 +- workflows/sc-vdj-profile.cwl | 689 +++++++++++++++++++++++++++ 16 files changed, 1526 insertions(+), 14 deletions(-) create mode 100644 tools/sc-vdj-profile.cwl create mode 100644 workflows/sc-vdj-profile.cwl diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index 3055ee63..4e531a43 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.27 + dockerPull: biowardrobe2/sc-tools:v0.0.28 inputs: diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index e18dd718..ef1228f3 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.27 + dockerPull: biowardrobe2/sc-tools:v0.0.28 inputs: diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index fc4281e7..45dcf816 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.27 + dockerPull: biowardrobe2/sc-tools:v0.0.28 inputs: diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index 6b80e04b..e2057413 100644 --- 
a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.27 + dockerPull: biowardrobe2/sc-tools:v0.0.28 inputs: diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index 845a6d46..ad1fbf72 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.27 + dockerPull: biowardrobe2/sc-tools:v0.0.28 inputs: diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index 7b439a78..018a7f83 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.27 + dockerPull: biowardrobe2/sc-tools:v0.0.28 inputs: diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index 07acc149..c2d9ac29 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.27 + dockerPull: biowardrobe2/sc-tools:v0.0.28 inputs: diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index 23203919..3fcb9bcd 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.27 + dockerPull: biowardrobe2/sc-tools:v0.0.28 inputs: diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index 053247e2..36e0ec12 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.27 + dockerPull: biowardrobe2/sc-tools:v0.0.28 inputs: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index adc0eaf4..053a0820 100644 --- a/tools/sc-rna-filter.cwl 
+++ b/tools/sc-rna-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.27 + dockerPull: biowardrobe2/sc-tools:v0.0.28 inputs: diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index d19478e9..dceddd20 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.27 + dockerPull: biowardrobe2/sc-tools:v0.0.28 inputs: diff --git a/tools/sc-rna-trajectory.cwl b/tools/sc-rna-trajectory.cwl index 691c375b..a070e33f 100644 --- a/tools/sc-rna-trajectory.cwl +++ b/tools/sc-rna-trajectory.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.27 + dockerPull: biowardrobe2/sc-tools:v0.0.28 inputs: diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index 8e58dfde..12490257 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.27 + dockerPull: biowardrobe2/sc-tools:v0.0.28 inputs: diff --git a/tools/sc-vdj-profile.cwl b/tools/sc-vdj-profile.cwl new file mode 100644 index 00000000..611c6785 --- /dev/null +++ b/tools/sc-vdj-profile.cwl @@ -0,0 +1,823 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: EnvVarRequirement + envDef: + R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-tools:v0.0.28 + + +inputs: + + query_data_rds: + type: File + inputBinding: + prefix: "--query" + doc: | + Path to the RDS file to load Seurat object from. This + file should include gene expression information stored + in the RNA assay, as well as 'pca' and 'rnaumap' + dimensionality reductions applied to that assay. 
+ + contigs_data: + type: File + inputBinding: + prefix: "--contigs" + doc: | + Path to the file with high-level annotations of each + high-confidence contig from cell-associated barcodes + from the Cell Ranger Multi or Cell Ranger Aggregate + experiments in TSV/CSV format. + + datasets_metadata: + type: File? + inputBinding: + prefix: "--metadata" + doc: | + Path to the TSV/CSV file to optionally extend Seurat + object metadata with categorical values using samples + identities. First column - 'library_id' should correspond + to all unique values from the 'new.ident' column of the + loaded Seurat object. If any of the provided in this file + columns are already present in the Seurat object metadata, + they will be overwritten. When combined with --barcodes + parameter, first the metadata will be extended, then barcode + filtering will be applied. Default: no extra metadata is added + + barcodes_data: + type: File? + inputBinding: + prefix: "--barcodes" + doc: | + Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata overwriting the existing ones + if those are present. Default: all cells used, no + extra metadata is added + + query_source_column: + type: string + inputBinding: + prefix: "--source" + doc: | + Column from the metadata of the loaded Seurat + object to select clusters from. + + analysis_mode: + type: + - "null" + - type: enum + symbols: + - "tcr" + - "bcr" + inputBinding: + prefix: "--mode" + doc: | + Analysis mode. tcr: T-cell receptor with alpha, beta, + delta, and gamma chains. bcr: B-cell receptor with + heavy and light immunoglobulin chains. Default: tcr + + cloneby: + type: + - "null" + - type: enum + symbols: + - "gene" + - "nt" + - "aa" + - "strict" + inputBinding: + prefix: "--cloneby" + doc: | + Defines how to call the clonotype. 
gene: based on VDJC gene + sequence. nt: based on the nucleotide sequence. aa: based on + the amino acid sequence. strict: based on the combination of + the nucleotide and gene sequences. Default: gene + + groupby: + type: string? + inputBinding: + prefix: "--groupby" + doc: | + Column from the metadata of the loaded Seurat object + to group cells for clonotype frequency calculation. + Default: group by dataset + + strictness: + type: + - "null" + - type: enum + symbols: + - "removemulti" + - "filtermulti" + inputBinding: + prefix: "--strictness" + doc: | + Apply stringency filters. Removemulti: remove any cell + with more than 2 immune receptor chains. Filtermulti: + isolate the top 2 expressed chains in cell with multiple + chains. Default: do not apply any filters + + export_pdf_plots: + type: boolean? + inputBinding: + prefix: "--pdf" + doc: | + Export plots in PDF. + Default: false + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + inputBinding: + prefix: "--theme" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + + verbose: + type: boolean? + inputBinding: + prefix: "--verbose" + doc: | + Print debug information. + Default: false + + export_h5seurat_data: + type: boolean? + inputBinding: + prefix: "--h5seurat" + doc: | + Save Seurat data to h5seurat file. + Default: false + + export_h5ad_data: + type: boolean? + inputBinding: + prefix: "--h5ad" + doc: | + Save Seurat data to h5ad file. + Default: false + + export_scope_data: + type: boolean? + inputBinding: + prefix: "--scope" + doc: | + Save Seurat data to SCope compatible + loom file. Only not normalized raw + counts from the RNA assay will be + saved. If loaded Seurat object doesn't + have RNA assay this parameter will be + ignored. Default: false + + export_ucsc_cb: + type: boolean? 
+ inputBinding: + prefix: "--cbbuild" + doc: | + Export results to UCSC Cell Browser. Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix. + Default: ./sc + + parallel_memory_limit: + type: int? + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Default: 32 + + vector_memory_limit: + type: int? + default: 128 + doc: | + Maximum vector memory in GB allowed to be used by R. + Default: 128 + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. + Default: 1 + + +outputs: + + count_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_count_spl_idnt.png" + doc: | + Unique clonotypes, + split by dataset + PNG format + + count_spl_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_count_spl_idnt.pdf" + doc: | + Unique clonotypes, + split by dataset + PDF format + + count_spl_clst_plot_png: + type: File? + outputBinding: + glob: "*_count_spl_clst.png" + doc: | + Unique clonotypes, + split by cluster + PNG format + + count_spl_clst_plot_pdf: + type: File? + outputBinding: + glob: "*_count_spl_clst.pdf" + doc: | + Unique clonotypes, + split by cluster + PDF format + + hmst_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_hmst_spl_idnt.png" + doc: | + Clonal space homeostasis, + split by dataset + PNG format + + hmst_spl_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_hmst_spl_idnt.pdf" + doc: | + Clonal space homeostasis, + split by dataset + PDF format + + hmst_spl_clst_plot_png: + type: File? + outputBinding: + glob: "*_hmst_spl_clst.png" + doc: | + Clonal space homeostasis, + split by cluster + PNG format + + hmst_spl_clst_plot_pdf: + type: File? + outputBinding: + glob: "*_hmst_spl_clst.pdf" + doc: | + Clonal space homeostasis, + split by cluster + PDF format + + vrlp_spl_clst_plot_png: + type: File? 
+ outputBinding: + glob: "*_vrlp_spl_clst.png" + doc: | + Clonotypes similarity, + split by cluster + PNG format + + vrlp_spl_clst_plot_pdf: + type: File? + outputBinding: + glob: "*_vrlp_spl_clst.pdf" + doc: | + Clonotypes similarity, + split by cluster + PDF format + + vrlp_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_vrlp_spl_idnt.png" + doc: | + Clonotypes similarity, + split by dataset + PNG format + + vrlp_spl_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_vrlp_spl_idnt.pdf" + doc: | + Clonotypes similarity, + split by dataset + PDF format + + ntwr_gr_clst_plot_png: + type: File? + outputBinding: + glob: "*_ntwr_gr_clst.png" + doc: | + Clonotypes network, + colored by cluster + PNG format + + ntwr_gr_clst_plot_pdf: + type: File? + outputBinding: + glob: "*_ntwr_gr_clst.pdf" + doc: | + Clonotypes network, + colored by cluster + PDF format + + ntwr_gr_idnt_plot_png: + type: File? + outputBinding: + glob: "*_ntwr_gr_idnt.png" + doc: | + Clonotypes network, + colored by dataset + PNG format + + ntwr_gr_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_ntwr_gr_idnt.pdf" + doc: | + Clonotypes network, + colored by dataset + PDF format + + dvrs_gr_clst_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_dvrs_gr_clst_spl_idnt.png" + doc: | + Clonotypes diversity, + colored by cluster, + split by dataset + PNG format + + dvrs_gr_clst_spl_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_dvrs_gr_clst_spl_idnt.pdf" + doc: | + Clonotypes diversity, + colored by cluster, + split by dataset + PDF format + + dvrs_gr_idnt_spl_clst_plot_png: + type: File? + outputBinding: + glob: "*_dvrs_gr_idnt_spl_clst.png" + doc: | + Clonotypes diversity, + colored by dataset, + split by cluster + PNG format + + dvrs_gr_idnt_spl_clst_plot_pdf: + type: File? 
+ outputBinding: + glob: "*_dvrs_gr_idnt_spl_clst.pdf" + doc: | + Clonotypes diversity, + colored by dataset, + split by cluster + PDF format + + gene_spl_clst_vdjc_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_gene_spl_clst_*.png" + doc: | + Relative usage of V, D, J, C + genes, split by cluster + PNG format + + gene_spl_clst_vdjc_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_gene_spl_clst_*.pdf" + doc: | + Relative usage of V, D, J, C + genes, split by cluster + PDF format + + gene_spl_idnt_vdjc_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_gene_spl_idnt_*.png" + doc: | + Relative usage of V, D, J, C + genes, split by dataset + PNG format + + gene_spl_idnt_vdjc_plot_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_gene_spl_idnt_*.pdf" + doc: | + Relative usage of V, D, J, C + genes, split by dataset + PDF format + + chrd_gr_clst_plot_png: + type: File? + outputBinding: + glob: "*_chrd_gr_clst.png" + doc: | + Shared clonotype, + colored by cluster + PNG format + + chrd_gr_clst_plot_pdf: + type: File? + outputBinding: + glob: "*_chrd_gr_clst.pdf" + doc: | + Shared clonotype, + colored by cluster + PDF format + + chrd_gr_idnt_plot_png: + type: File? + outputBinding: + glob: "*_chrd_gr_idnt.png" + doc: | + Shared clonotype, + colored by dataset + PNG format + + chrd_gr_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_chrd_gr_idnt.pdf" + doc: | + Shared clonotype, + colored by dataset + PDF format + + chrd_gr_cnd_plot_png: + type: File? + outputBinding: + glob: "*_chrd_gr_cnd.png" + doc: | + Shared clonotype, + colored by grouping condition + PNG format + + chrd_gr_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_chrd_gr_cnd.pdf" + doc: | + Shared clonotype, + colored by grouping condition + PDF format + + count_spl_cnd_plot_png: + type: File? 
+ outputBinding: + glob: "*_count_spl_cnd.png" + doc: | + Unique clonotypes, + split by grouping condition + PNG format + + count_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_count_spl_cnd.pdf" + doc: | + Unique clonotypes, + split by grouping condition + PDF format + + hmst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_hmst_spl_cnd.png" + doc: | + Clonal space homeostasis, + split by grouping condition + PNG format + + hmst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_hmst_spl_cnd.pdf" + doc: | + Clonal space homeostasis, + split by grouping condition + PDF format + + vrlp_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_vrlp_spl_cnd.png" + doc: | + Clonotypes similarity, + split by grouping condition + PNG format + + vrlp_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_vrlp_spl_cnd.pdf" + doc: | + Clonotypes similarity, + split by grouping condition + PDF format + + ntwr_gr_cnd_plot_png: + type: File? + outputBinding: + glob: "*_ntwr_gr_cnd.png" + doc: | + Clonotypes network, + colored by grouping condition + PNG format + + ntwr_gr_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_ntwr_gr_cnd.pdf" + doc: | + Clonotypes network, + colored by grouping condition + PDF format + + dvrs_gr_clst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_dvrs_gr_clst_spl_cnd.png" + doc: | + Clonotypes diversity, + colored by cluster, + split by grouping condition + PNG format + + dvrs_gr_clst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_dvrs_gr_clst_spl_cnd.pdf" + doc: | + Clonotypes diversity, + colored by cluster, + split by grouping condition + PDF format + + dvrs_gr_cnd_spl_clst_plot_png: + type: File? + outputBinding: + glob: "*_dvrs_gr_cnd_spl_clst.png" + doc: | + Clonotypes diversity, + colored by grouping condition, + split by cluster + PNG format + + dvrs_gr_cnd_spl_clst_plot_pdf: + type: File? 
+ outputBinding: + glob: "*_dvrs_gr_cnd_spl_clst.pdf" + doc: | + Clonotypes diversity, + colored by grouping condition, + split by cluster + PDF format + + ucsc_cb_config_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser" + doc: | + Directory with UCSC Cellbrowser + configuration data. + + ucsc_cb_html_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser/html_data" + doc: | + Directory with UCSC Cellbrowser + html data. + + ucsc_cb_html_file: + type: File? + outputBinding: + glob: "*_cellbrowser/html_data/index.html" + doc: | + HTML index file from the directory + with UCSC Cellbrowser html data. + + seurat_data_rds: + type: File + outputBinding: + glob: "*_data.rds" + doc: | + Reduced Seurat data in RDS format + + seurat_data_h5seurat: + type: File? + outputBinding: + glob: "*_data.h5seurat" + doc: | + Reduced Seurat data in h5seurat format + + seurat_data_h5ad: + type: File? + outputBinding: + glob: "*_data.h5ad" + doc: | + Reduced Seurat data in h5ad format + + seurat_data_scope: + type: File? 
+ outputBinding: + glob: "*_data.loom" + doc: | + Reduced Seurat data in SCope + compatible loom format + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["sc_vdj_profile.R"] + +stdout: sc_vdj_profile_stdout.log +stderr: sc_vdj_profile_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-cell Immune Profiling Analysis" +s:name: "Single-cell Immune Profiling Analysis" +s:alternateName: "TCR/BCR clonotype dynamics analysis" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-vdj-profile.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell Immune Profiling Analysis + + TCR/BCR clonotype dynamics analysis + + +s:about: | + usage: sc_vdj_profile.R [-h] --query QUERY --contigs CONTIGS + [--metadata METADATA] [--barcodes BARCODES] --source + SOURCE [--mode {tcr,bcr}] + [--cloneby {gene,nt,aa,strict}] [--groupby GROUPBY] + [--strictness 
{removemulti,filtermulti}] [--pdf] + [--verbose] [--h5seurat] [--h5ad] [--cbbuild] + [--scope] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + + Single-cell Immune Profiling Analysis + + optional arguments: + -h, --help show this help message and exit + --query QUERY Path to the RDS file to load Seurat object from. This + file should include gene expression information stored + in the RNA assay, as well as 'pca' and 'rnaumap' + dimensionality reductions applied to that assay. + --contigs CONTIGS Path to the file with high-level annotations of each + high-confidence contig from cell-associated barcodes + from the Cell Ranger Multi or Cell Ranger Aggregate + experiments in TSV/CSV format. + --metadata METADATA Path to the TSV/CSV file to optionally extend Seurat + object metadata with categorical values using samples + identities. First column - 'library_id' should + correspond to all unique values from the 'new.ident' + column of the loaded Seurat object. If any of the + provided in this file columns are already present in + the Seurat object metadata, they will be overwritten. + When combined with --barcodes parameter, first the + metadata will be extended, then barcode filtering will + be applied. Default: no extra metadata is added + --barcodes BARCODES Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata be selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata ovewriting the existing ones if + those are present. Default: all cells used, no extra + metadata is added + --source SOURCE Column from the metadata of the loaded Seurat object + to select clusters from. + --mode {tcr,bcr} Analysis mode. tcr: T-cell receptor with alpha, beta, + delta, and gamma chains. bcr: B-cell receptor with + heavy and light immunoglobulin chains. 
Default: tcr + --cloneby {gene,nt,aa,strict} + Defines how to call the clonotype. gene: based on VDJC + gene sequence. nt: based on the nucleotide sequence. + aa: based on the amino acid sequence. strict: based on + the combination of the nucleotide and gene sequences. + Default: gene + --groupby GROUPBY Column from the metadata of the loaded Seurat object + to group cells for clonotype frequency calculation. + Default: group by dataset + --strictness {removemulti,filtermulti} + Apply stringency filters. Removemulti: remove any cell + with more than 2 immune receptor chains. Filtermulti: + isolate the top 2 expressed chains in cell with + multiple chains. Default: do not apply any filters. + --pdf Export plots in PDF. Default: false + --verbose Print debug information. Default: false + --h5seurat Save Seurat data to h5seurat file. Default: false + --h5ad Save Seurat data to h5ad file. Default: false + --cbbuild Export results to UCSC Cell Browser. Default: false + --scope Save Seurat data to SCope compatible loom file. + Default: false + --output OUTPUT Output prefix. Default: ./sc + --theme {gray,bw,linedraw,light,dark,minimal,classic,void} + Color theme for all generated plots. Default: classic + --cpus CPUS Number of cores/cpus to use. Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple --cpus. 
Default: 32 \ No newline at end of file diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index 2d1f48c6..ff23fd11 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.27 + dockerPull: biowardrobe2/sc-tools:v0.0.28 inputs: diff --git a/workflows/sc-vdj-profile.cwl b/workflows/sc-vdj-profile.cwl new file mode 100644 index 00000000..1c5d18f9 --- /dev/null +++ b/workflows/sc-vdj-profile.cwl @@ -0,0 +1,689 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + + +'sd:upstream': + sc_tools_sample: + - "sc-rna-cluster.cwl" + - "sc-ctype-assign.cwl" + sc_vdj_sample: + - "cellranger-multi.cwl" + - "cellranger-aggr.cwl" + + +inputs: + + alias: + type: string + label: "Analysis name" + sd:preview: + position: 1 + + query_data_rds: + type: File + label: "Single-cell Analysis with Clustered RNA-Seq Datasets" + doc: | + Analysis that includes single-cell + RNA-Seq datasets run through either + "Single-cell Manual Cell Type + Assignment" or "Single-cell RNA-Seq + Cluster Analysis" at any of the + processing stages. + 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" + 'sd:localLabel': true + + contigs_data: + type: File + label: "Cell Ranger Immune Profiling Sample" + doc: | + "Cell Ranger Multi Gene Expression and + V(D)J Repertoire Profiling" or "Cell + Ranger Aggregate" sample to load high + level annotations of each high-confidence + contig from the cell-associated barcodes + 'sd:upstreamSource': "sc_vdj_sample/filtered_contig_annotations_csv" + 'sd:localLabel': true + + query_source_column: + type: string + label: "Cells grouping" + doc: | + Single cell metadata column to group + cells into clusters. 
Usually, in a form + of "[rna|atac|wsnn]_res.X", where X is + the clustering resolution. If cell types + are available, add "custom_" prefix to + the column name. + + analysis_mode: + type: + - "null" + - type: enum + symbols: + - "tcr" + - "bcr" + default: "tcr" + label: "Analysis mode" + doc: | + Analysis mode. tcr: T-cell receptor with + alpha, beta, delta, and gamma chains. + bcr: B-cell receptor with heavy and light + immunoglobulin chains. + Default: tcr + + cloneby: + type: + - "null" + - type: enum + symbols: + - "gene" + - "nt" + - "aa" + - "strict" + default: "gene" + label: "Clonotype calling" + doc: | + Defines how to call the clonotype. + gene: based on VDJC gene sequence. + nt: based on the nucleotide sequence. + aa: based on the amino acid sequence. + strict: based on the combination of + the nucleotide and gene sequences. + Default: gene + + strictness: + type: + - "null" + - type: enum + symbols: + - "removemulti" + - "filtermulti" + - "none" + default: "none" + label: "Stringency filter" + doc: | + Apply stringency filters. removemulti: + remove any cell with more than 2 immune + receptor chains. filtermulti: isolate + the top 2 expressed chains in cell with + multiple chains. none: do not apply any + filters. Default: none + + datasets_metadata: + type: File? + label: "Datasets metadata (optional)" + doc: | + If the selected single-cell analysis + includes multiple aggregated datasets, + each of them can be assigned to a + separate group by one or multiple + categories. This can be achieved by + providing a TSV/CSV file with + "library_id" as the first column and + any number of additional columns with + unique names, representing the desired + grouping categories. 
+# NOTE: not available yet, because the sc-rna-filter pipeline has not been refactored. +# To obtain a proper template of this file, download +# "datasets_metadata.tsv" output from the +# "Files" tab of the selected "Single-cell +# Analysis with Filtered RNA-Seq Datasets" +# and add extra columns as needed. + + barcodes_data: + type: File? + label: "Selected cell barcodes (optional)" + doc: | + A TSV/CSV file to optionally prefilter + the single cell data by including only + the cells with the selected barcodes. + The provided file should include at + least one column named "barcode", with + one cell barcode per line. All other + columns, except for "barcode", will be + added to the single cell metadata loaded + from "Single-cell Analysis with Clustered + RNA-Seq Datasets" and can be utilized in + the current or future steps of analysis. + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + default: "classic" + label: "Plots color theme" + doc: | + Color theme for all plots saved + as PNG files. + Default: classic + "sd:layout": + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + - "2" + default: "1" + label: "Cores/CPUs" + doc: | + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 1 + "sd:layout": + advanced: true + + +outputs: + + count_spl_idnt_plot_png: + type: File? + outputSource: vdj_profile/count_spl_idnt_plot_png + label: "Unique clonotypes, split by dataset" + doc: | + Unique clonotypes, + split by dataset + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Unique clonotypes, split by dataset' + + hmst_spl_idnt_plot_png: + type: File?
+ outputSource: vdj_profile/hmst_spl_idnt_plot_png + label: "Clonal space homeostasis, split by dataset" + doc: | + Clonal space homeostasis, + split by dataset + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Clonal space homeostasis, split by dataset' + + vrlp_spl_idnt_plot_png: + type: File? + outputSource: vdj_profile/vrlp_spl_idnt_plot_png + label: "Clonotypes similarity, split by dataset" + doc: | + Clonotypes similarity, + split by dataset + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Clonotypes similarity, split by dataset' + + ntwr_gr_idnt_plot_png: + type: File? + outputSource: vdj_profile/ntwr_gr_idnt_plot_png + label: "Clonotypes network, colored by dataset" + doc: | + Clonotypes network, + colored by dataset + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Clonotypes network, colored by dataset' + + dvrs_gr_clst_spl_idnt_plot_png: + type: File? + outputSource: vdj_profile/dvrs_gr_clst_spl_idnt_plot_png + label: "Clonotypes diversity, colored by cluster, split by dataset" + doc: | + Clonotypes diversity, + colored by cluster, + split by dataset + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Clonotypes diversity, colored by cluster, split by dataset' + + chrd_gr_idnt_plot_png: + type: File? + outputSource: vdj_profile/chrd_gr_idnt_plot_png + label: "Shared clonotype, colored by dataset" + doc: | + Shared clonotype, + colored by dataset + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Shared clonotype, colored by dataset' + + gene_spl_idnt_vdjc_plot_png: + type: + - "null" + - type: array + items: File + outputSource: vdj_profile/gene_spl_idnt_vdjc_plot_png + label: "Relative usage of V, D, J, C genes, split by dataset" + doc: | + Relative usage of V, D, J, C + genes, split by dataset + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Relative usage of V, D, J, C genes, split by dataset' + + count_spl_clst_plot_png: + type: File? 
+ outputSource: vdj_profile/count_spl_clst_plot_png + label: "Unique clonotypes, split by cluster" + doc: | + Unique clonotypes, + split by cluster + 'sd:visualPlugins': + - image: + tab: 'Per cluster' + Caption: 'Unique clonotypes, split by cluster' + + hmst_spl_clst_plot_png: + type: File? + outputSource: vdj_profile/hmst_spl_clst_plot_png + label: "Clonal space homeostasis, split by cluster" + doc: | + Clonal space homeostasis, + split by cluster + 'sd:visualPlugins': + - image: + tab: 'Per cluster' + Caption: 'Clonal space homeostasis, split by cluster' + + vrlp_spl_clst_plot_png: + type: File? + outputSource: vdj_profile/vrlp_spl_clst_plot_png + label: "Clonotypes similarity, split by cluster" + doc: | + Clonotypes similarity, + split by cluster + 'sd:visualPlugins': + - image: + tab: 'Per cluster' + Caption: 'Clonotypes similarity, split by cluster' + + ntwr_gr_clst_plot_png: + type: File? + outputSource: vdj_profile/ntwr_gr_clst_plot_png + label: "Clonotypes network, colored by cluster" + doc: | + Clonotypes network, + colored by cluster + 'sd:visualPlugins': + - image: + tab: 'Per cluster' + Caption: 'Clonotypes network, colored by cluster' + + dvrs_gr_idnt_spl_clst_plot_png: + type: File? + outputSource: vdj_profile/dvrs_gr_idnt_spl_clst_plot_png + label: "Clonotypes diversity, colored by dataset, split by cluster" + doc: | + Clonotypes diversity, + colored by dataset, + split by cluster + 'sd:visualPlugins': + - image: + tab: 'Per cluster' + Caption: 'Clonotypes diversity, colored by dataset, split by cluster' + + chrd_gr_clst_plot_png: + type: File? 
+ outputSource: vdj_profile/chrd_gr_clst_plot_png + label: "Shared clonotype, colored by cluster" + doc: | + Shared clonotype, + colored by cluster + 'sd:visualPlugins': + - image: + tab: 'Per cluster' + Caption: 'Shared clonotype, colored by cluster' + + gene_spl_clst_vdjc_plot_png: + type: + - "null" + - type: array + items: File + outputSource: vdj_profile/gene_spl_clst_vdjc_plot_png + label: "Relative usage of V, D, J, C genes, split by cluster" + doc: | + Relative usage of V, D, J, C + genes, split by cluster + 'sd:visualPlugins': + - image: + tab: 'Per cluster' + Caption: 'Relative usage of V, D, J, C genes, split by cluster' + + count_spl_cnd_plot_png: + type: File? + outputSource: vdj_profile/count_spl_cnd_plot_png + label: "Unique clonotypes, split by grouping condition" + doc: | + Unique clonotypes, + split by grouping + condition + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Unique clonotypes, split by grouping condition' + + hmst_spl_cnd_plot_png: + type: File? + outputSource: vdj_profile/hmst_spl_cnd_plot_png + label: "Clonal space homeostasis, split by grouping condition" + doc: | + Clonal space homeostasis, + split by grouping condition + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Clonal space homeostasis, split by grouping condition' + + vrlp_spl_cnd_plot_png: + type: File? + outputSource: vdj_profile/vrlp_spl_cnd_plot_png + label: "Clonotypes similarity, split by grouping condition" + doc: | + Clonotypes similarity, + split by grouping condition + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Clonotypes similarity, split by grouping condition' + + ntwr_gr_cnd_plot_png: + type: File? 
+ outputSource: vdj_profile/ntwr_gr_cnd_plot_png + label: "Clonotypes network, colored by grouping condition" + doc: | + Clonotypes network, + colored by grouping condition + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Clonotypes network, colored by grouping condition' + + dvrs_gr_clst_spl_cnd_plot_png: + type: File? + outputSource: vdj_profile/dvrs_gr_clst_spl_cnd_plot_png + label: "Clonotypes diversity, colored by cluster, split by grouping condition" + doc: | + Clonotypes diversity, + colored by cluster, + split by grouping condition + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Clonotypes diversity, colored by cluster, split by grouping condition' + + dvrs_gr_cnd_spl_clst_plot_png: + type: File? + outputSource: vdj_profile/dvrs_gr_cnd_spl_clst_plot_png + label: "Clonotypes diversity, colored by grouping condition, split by cluster" + doc: | + Clonotypes diversity, + colored by grouping condition, + split by cluster + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Clonotypes diversity, colored by grouping condition, split by cluster' + + chrd_gr_cnd_plot_png: + type: File? + outputSource: vdj_profile/chrd_gr_cnd_plot_png + label: "Shared clonotype, colored by grouping condition" + doc: | + Shared clonotype, + colored by grouping + condition + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Shared clonotype, colored by grouping condition' + + ucsc_cb_html_data: + type: Directory? + outputSource: vdj_profile/ucsc_cb_html_data + label: "UCSC Cell Browser data" + doc: | + Directory with UCSC Cell Browser + data + + ucsc_cb_html_file: + type: File? 
+ outputSource: vdj_profile/ucsc_cb_html_file + label: "UCSC Cell Browser" + doc: | + UCSC Cell Browser HTML index file + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + + seurat_data_rds: + type: File + outputSource: vdj_profile/seurat_data_rds + label: "Processed Seurat data in RDS format" + doc: | + Processed Seurat data in RDS format + + seurat_data_scope: + type: File? + outputSource: vdj_profile/seurat_data_scope + label: "Processed Seurat data in SCope compatible loom format" + doc: | + Processed Seurat data in SCope compatible loom format + + pdf_plots: + type: File + outputSource: compress_pdf_plots/compressed_folder + label: "Plots in PDF format" + doc: | + Compressed folder with plots + in PDF format + + vdj_profile_stdout_log: + type: File + outputSource: vdj_profile/stdout_log + label: "stdout log generated by vdj_profile step" + doc: | + stdout log generated by vdj_profile step + + vdj_profile_stderr_log: + type: File + outputSource: vdj_profile/stderr_log + label: "stderr log generated by vdj_profile step" + doc: | + stderr log generated by vdj_profile step + + +steps: + + vdj_profile: + run: ../tools/sc-vdj-profile.cwl + in: + query_data_rds: query_data_rds + contigs_data: contigs_data + datasets_metadata: datasets_metadata + barcodes_data: barcodes_data + query_source_column: query_source_column + analysis_mode: analysis_mode + cloneby: cloneby + groupby: + default: "new.ident" + strictness: + source: strictness + valueFrom: $(self=="none"?null:self) + color_theme: color_theme + export_pdf_plots: + default: true + verbose: + default: true + export_ucsc_cb: + default: true + export_scope_data: + default: true + parallel_memory_limit: + default: 32 + vector_memory_limit: + default: 96 + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - count_spl_idnt_plot_png + - count_spl_idnt_plot_pdf + - count_spl_clst_plot_png + - count_spl_clst_plot_pdf + - hmst_spl_idnt_plot_png + - hmst_spl_idnt_plot_pdf + - 
hmst_spl_clst_plot_png + - hmst_spl_clst_plot_pdf + - vrlp_spl_clst_plot_png + - vrlp_spl_clst_plot_pdf + - vrlp_spl_idnt_plot_png + - vrlp_spl_idnt_plot_pdf + - ntwr_gr_clst_plot_png + - ntwr_gr_clst_plot_pdf + - ntwr_gr_idnt_plot_png + - ntwr_gr_idnt_plot_pdf + - dvrs_gr_clst_spl_idnt_plot_png + - dvrs_gr_clst_spl_idnt_plot_pdf + - dvrs_gr_idnt_spl_clst_plot_png + - dvrs_gr_idnt_spl_clst_plot_pdf + - gene_spl_clst_vdjc_plot_png + - gene_spl_clst_vdjc_plot_pdf + - gene_spl_idnt_vdjc_plot_png + - gene_spl_idnt_vdjc_plot_pdf + - chrd_gr_clst_plot_png + - chrd_gr_clst_plot_pdf + - chrd_gr_idnt_plot_png + - chrd_gr_idnt_plot_pdf + - chrd_gr_cnd_plot_png + - chrd_gr_cnd_plot_pdf + - count_spl_cnd_plot_png + - count_spl_cnd_plot_pdf + - hmst_spl_cnd_plot_png + - hmst_spl_cnd_plot_pdf + - vrlp_spl_cnd_plot_png + - vrlp_spl_cnd_plot_pdf + - ntwr_gr_cnd_plot_png + - ntwr_gr_cnd_plot_pdf + - dvrs_gr_clst_spl_cnd_plot_png + - dvrs_gr_clst_spl_cnd_plot_pdf + - dvrs_gr_cnd_spl_clst_plot_png + - dvrs_gr_cnd_spl_clst_plot_pdf + - ucsc_cb_html_data + - ucsc_cb_html_file + - seurat_data_rds + - seurat_data_scope + - stdout_log + - stderr_log + + pdf_plots: + run: ../tools/files-to-folder.cwl + in: + input_files: + source: + - vdj_profile/count_spl_idnt_plot_pdf + - vdj_profile/count_spl_clst_plot_pdf + - vdj_profile/hmst_spl_idnt_plot_pdf + - vdj_profile/hmst_spl_clst_plot_pdf + - vdj_profile/vrlp_spl_clst_plot_pdf + - vdj_profile/vrlp_spl_idnt_plot_pdf + - vdj_profile/ntwr_gr_clst_plot_pdf + - vdj_profile/ntwr_gr_idnt_plot_pdf + - vdj_profile/dvrs_gr_clst_spl_idnt_plot_pdf + - vdj_profile/dvrs_gr_idnt_spl_clst_plot_pdf + - vdj_profile/gene_spl_clst_vdjc_plot_pdf + - vdj_profile/gene_spl_idnt_vdjc_plot_pdf + - vdj_profile/chrd_gr_clst_plot_pdf + - vdj_profile/chrd_gr_idnt_plot_pdf + - vdj_profile/chrd_gr_cnd_plot_pdf + - vdj_profile/count_spl_cnd_plot_pdf + - vdj_profile/hmst_spl_cnd_plot_pdf + - vdj_profile/vrlp_spl_cnd_plot_pdf + - vdj_profile/ntwr_gr_cnd_plot_pdf + - 
vdj_profile/dvrs_gr_clst_spl_cnd_plot_pdf + - vdj_profile/dvrs_gr_cnd_spl_clst_plot_pdf + valueFrom: $(self.flat().filter(n => n)) + folder_basename: + default: "pdf_plots" + out: + - folder + + compress_pdf_plots: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: pdf_plots/folder + out: + - compressed_folder + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-cell Immune Profiling Analysis" +s:name: "Single-cell Immune Profiling Analysis" +s:alternateName: "TCR/BCR clonotype dynamics analysis" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-vdj-profile.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell Immune Profiling Analysis + + TCR/BCR clonotype dynamics analysis \ No newline at end of file From 284ba6b4fed62be47849172ecfdc8570114241c9 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 23 Aug 2023 13:06:18 -0400 Subject: [PATCH 
066/162] Update volcano plot image to the latest --- tools/ma-plot.cwl | 2 +- tools/volcano-plot.cwl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ma-plot.cwl b/tools/ma-plot.cwl index da5bbf74..737dbba0 100644 --- a/tools/ma-plot.cwl +++ b/tools/ma-plot.cwl @@ -4,7 +4,7 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/visualization:v0.0.7 + dockerPull: biowardrobe2/visualization:v0.0.8 inputs: diff --git a/tools/volcano-plot.cwl b/tools/volcano-plot.cwl index c834dc22..8cebb00a 100644 --- a/tools/volcano-plot.cwl +++ b/tools/volcano-plot.cwl @@ -4,7 +4,7 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/visualization:v0.0.7 + dockerPull: biowardrobe2/visualization:v0.0.8 inputs: From 232418562a33642a9b618c26101b5d341ebf371f Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 24 Aug 2023 12:14:29 -0400 Subject: [PATCH 067/162] Update gene markers heatmap --- tools/sc-atac-cluster.cwl | 2 +- tools/sc-atac-coverage.cwl | 2 +- tools/sc-atac-dbinding.cwl | 2 +- tools/sc-atac-reduce.cwl | 2 +- tools/sc-ctype-assign.cwl | 11 ++++++++++- tools/sc-multiome-filter.cwl | 2 +- tools/sc-rna-cluster.cwl | 14 +++++++++++++- tools/sc-rna-da-cells.cwl | 2 +- tools/sc-rna-de-pseudobulk.cwl | 2 +- tools/sc-rna-filter.cwl | 2 +- tools/sc-rna-reduce.cwl | 2 +- tools/sc-rna-trajectory.cwl | 2 +- tools/sc-triangulate.cwl | 2 +- tools/sc-vdj-profile.cwl | 2 +- tools/sc-wnn-cluster.cwl | 14 +++++++++++++- workflows/sc-ctype-assign.cwl | 9 +++++++++ workflows/sc-rna-cluster.cwl | 12 ++++++++++++ workflows/sc-wnn-cluster.cwl | 12 ++++++++++++ 18 files changed, 81 insertions(+), 15 deletions(-) diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index 4e531a43..6b4800cc 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.28 + dockerPull: 
biowardrobe2/sc-tools:v0.0.29 inputs: diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index ef1228f3..95dc12e5 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.28 + dockerPull: biowardrobe2/sc-tools:v0.0.29 inputs: diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index 45dcf816..77e0ac96 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.28 + dockerPull: biowardrobe2/sc-tools:v0.0.29 inputs: diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index e2057413..500dc9a6 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.28 + dockerPull: biowardrobe2/sc-tools:v0.0.29 inputs: diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index ad1fbf72..dcfff3e7 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.28 + dockerPull: biowardrobe2/sc-tools:v0.0.29 inputs: @@ -832,6 +832,15 @@ outputs: Gene expression heatmap. PDF format + xpr_htmp_tsv: + type: File? + outputBinding: + glob: "*_xpr_htmp.tsv" + doc: | + Gene markers used for gene + expression heatmap. + TSV format + gene_markers_tsv: type: File? 
outputBinding: diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index 018a7f83..fa42a47f 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.28 + dockerPull: biowardrobe2/sc-tools:v0.0.29 inputs: diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index c2d9ac29..3fbfc7a6 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.28 + dockerPull: biowardrobe2/sc-tools:v0.0.29 inputs: @@ -633,6 +633,18 @@ outputs: Gene expression heatmap. PDF format + xpr_htmp_res_tsv: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_htmp_res_*.tsv" + doc: | + Gene markers used for gene + expression heatmap. + TSV format + gene_markers_tsv: type: File? outputBinding: diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index 3fcb9bcd..5818eb34 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.28 + dockerPull: biowardrobe2/sc-tools:v0.0.29 inputs: diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index 36e0ec12..1b1ba60f 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.28 + dockerPull: biowardrobe2/sc-tools:v0.0.29 inputs: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index 053a0820..cb26b8d7 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.28 + dockerPull: biowardrobe2/sc-tools:v0.0.29 inputs: diff --git a/tools/sc-rna-reduce.cwl 
b/tools/sc-rna-reduce.cwl index dceddd20..745b8998 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.28 + dockerPull: biowardrobe2/sc-tools:v0.0.29 inputs: diff --git a/tools/sc-rna-trajectory.cwl b/tools/sc-rna-trajectory.cwl index a070e33f..0efcf0fc 100644 --- a/tools/sc-rna-trajectory.cwl +++ b/tools/sc-rna-trajectory.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.28 + dockerPull: biowardrobe2/sc-tools:v0.0.29 inputs: diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index 12490257..47ac9f0d 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.28 + dockerPull: biowardrobe2/sc-tools:v0.0.29 inputs: diff --git a/tools/sc-vdj-profile.cwl b/tools/sc-vdj-profile.cwl index 611c6785..41e2efd3 100644 --- a/tools/sc-vdj-profile.cwl +++ b/tools/sc-vdj-profile.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.28 + dockerPull: biowardrobe2/sc-tools:v0.0.29 inputs: diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index ff23fd11..0f351246 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.28 + dockerPull: biowardrobe2/sc-tools:v0.0.29 inputs: @@ -773,6 +773,18 @@ outputs: Gene expression heatmap. PDF format + xpr_htmp_res_tsv: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_htmp_res_*.tsv" + doc: | + Gene markers used for gene + expression heatmap. + TSV format + gene_markers_tsv: type: File? 
outputBinding: diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 56518b97..0f268d34 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -547,6 +547,14 @@ outputs: tab: 'Genome coverage' Caption: 'Fragments coverage' + xpr_htmp_tsv: + type: File? + outputSource: ctype_assign/xpr_htmp_tsv + label: "Markers from gene expression heatmap" + doc: | + Gene markers used for gene + expression heatmap + gene_markers_tsv: type: File? outputSource: ctype_assign/gene_markers_tsv @@ -731,6 +739,7 @@ steps: - xpr_per_cell_sgnl_rd_wnnumap_plot_pdf - cvrg_plot_pdf - xpr_htmp_plot_pdf + - xpr_htmp_tsv - gene_markers_tsv - peak_markers_tsv - ucsc_cb_html_data diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index f35714e5..df02fc40 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -383,6 +383,17 @@ outputs: tab: 'Heatmap' Caption: 'Gene expression heatmap' + xpr_htmp_res_tsv: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/xpr_htmp_res_tsv + label: "Markers from gene expression heatmap" + doc: | + Gene markers used for gene + expression heatmap + gene_markers_tsv: type: File? outputSource: sc_rna_cluster/gene_markers_tsv @@ -526,6 +537,7 @@ steps: - xpr_per_cell_sgnl_plot_pdf - xpr_dnst_res_plot_pdf - xpr_htmp_res_plot_pdf + - xpr_htmp_res_tsv - gene_markers_tsv - ucsc_cb_html_data - ucsc_cb_html_file diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index 8caa52de..f881130a 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -431,6 +431,17 @@ outputs: tab: 'Genome coverage' Caption: 'Fragments coverage' + xpr_htmp_res_tsv: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/xpr_htmp_res_tsv + label: "Markers from gene expression heatmap" + doc: | + Gene markers used for gene + expression heatmap + gene_markers_tsv: type: File? 
outputSource: sc_wnn_cluster/gene_markers_tsv @@ -596,6 +607,7 @@ steps: - xpr_dnst_res_plot_pdf - cvrg_res_plot_pdf - xpr_htmp_res_plot_pdf + - xpr_htmp_res_tsv - gene_markers_tsv - peak_markers_tsv - ucsc_cb_html_data From 9024989569d38323f36a677a973e968ddc358b59 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 28 Aug 2023 17:51:09 -0400 Subject: [PATCH 068/162] Update sc RNA reduce to support regex input for removing the influence of certain genes --- tools/sc-rna-reduce.cwl | 16 +++++++--------- workflows/sc-rna-reduce.cwl | 7 ++++--- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index 745b8998..d8430d02 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -148,15 +148,12 @@ inputs: Default: false regress_genes: - type: - - "null" - - string - - string[] + type: string? inputBinding: prefix: "--regressgenes" doc: | - Genes which expression should be regressed as a confounding source of variation. - Default: None + Regex pattern to identify genes which expression should be + regressed as a confounding source of variation. Default: none regress_ccycle_full: type: boolean? @@ -867,9 +864,10 @@ s:about: | --regressmt Regress the percentage of transcripts mapped to mitochondrial genes as a confounding source of variation. Default: false - --regressgenes [REGRESSGENES [REGRESSGENES ...]] - Genes which expression should be regressed as a - confounding source of variation. Default: None + --regressgenes REGRESSGENES + Regex pattern to identify genes which expression + should be regressed as a confounding source of variation. + Default: none --regressccfull Regress all signals associated with cell cycle phase. Ignored if --cellcycle is not provided. Mutually exclusive with --regressccdiff parameter. 
Default: diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index b93ee22a..d7dfc0a9 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -171,8 +171,9 @@ inputs: label: "Regress genes" default: null doc: | - Regress expression of the selected genes - as a confounding source of variation. + Regex pattern to identify genes which + expression should be regressed as a + confounding source of variation. Default: None regress_mito_perc: @@ -598,7 +599,7 @@ steps: regress_mito_perc: regress_mito_perc regress_genes: source: regress_genes - valueFrom: $(split_features(self)) + valueFrom: $(self==""?null:self) # safety measure dimensions: dimensions verbose: default: true From db115b71af06411982a4fef2e0cfc97d2645aa25 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 1 Sep 2023 15:42:42 -0400 Subject: [PATCH 069/162] Update VDJ pipeline to produce combined plots --- tools/sc-vdj-profile.cwl | 21 ++------------------- workflows/sc-vdj-profile.cwl | 17 ----------------- 2 files changed, 2 insertions(+), 36 deletions(-) diff --git a/tools/sc-vdj-profile.cwl b/tools/sc-vdj-profile.cwl index 41e2efd3..6c3ce928 100644 --- a/tools/sc-vdj-profile.cwl +++ b/tools/sc-vdj-profile.cwl @@ -72,20 +72,6 @@ inputs: Column from the metadata of the loaded Seurat object to select clusters from. - analysis_mode: - type: - - "null" - - type: enum - symbols: - - "tcr" - - "bcr" - inputBinding: - prefix: "--mode" - doc: | - Analysis mode. tcr: T-cell receptor with alpha, beta, - delta, and gamma chains. bcr: B-cell receptor with - heavy and light immunoglobulin chains. 
Default: tcr - cloneby: type: - "null" @@ -751,8 +737,8 @@ doc: | s:about: | usage: sc_vdj_profile.R [-h] --query QUERY --contigs CONTIGS - [--metadata METADATA] [--barcodes BARCODES] --source - SOURCE [--mode {tcr,bcr}] + [--metadata METADATA] [--barcodes BARCODES] + --source SOURCE [--cloneby {gene,nt,aa,strict}] [--groupby GROUPBY] [--strictness {removemulti,filtermulti}] [--pdf] [--verbose] [--h5seurat] [--h5ad] [--cbbuild] @@ -791,9 +777,6 @@ s:about: | metadata is added --source SOURCE Column from the metadata of the loaded Seurat object to select clusters from. - --mode {tcr,bcr} Analysis mode. tcr: T-cell receptor with alpha, beta, - delta, and gamma chains. bcr: B-cell receptor with - heavy and light immunoglobulin chains. Default: tcr --cloneby {gene,nt,aa,strict} Defines how to call the clonotype. gene: based on VDJC gene sequence. nt: based on the nucleotide sequence. diff --git a/workflows/sc-vdj-profile.cwl b/workflows/sc-vdj-profile.cwl index 1c5d18f9..12b5e598 100644 --- a/workflows/sc-vdj-profile.cwl +++ b/workflows/sc-vdj-profile.cwl @@ -62,22 +62,6 @@ inputs: are available, add "custom_" prefix to the column name. - analysis_mode: - type: - - "null" - - type: enum - symbols: - - "tcr" - - "bcr" - default: "tcr" - label: "Analysis mode" - doc: | - Analysis mode. tcr: T-cell receptor with - alpha, beta, delta, and gamma chains. - bcr: B-cell receptor with heavy and light - immunoglobulin chains. 
- Default: tcr - cloneby: type: - "null" @@ -525,7 +509,6 @@ steps: datasets_metadata: datasets_metadata barcodes_data: barcodes_data query_source_column: query_source_column - analysis_mode: analysis_mode cloneby: cloneby groupby: default: "new.ident" From e9b2435573c3555fadebfff95d1cf7c71ce06899 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 5 Sep 2023 14:28:21 -0400 Subject: [PATCH 070/162] Allow to select dimensionality in the trajectory analysis --- workflows/sc-rna-trajectory.cwl | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/workflows/sc-rna-trajectory.cwl b/workflows/sc-rna-trajectory.cwl index 5af30827..1b3455e2 100644 --- a/workflows/sc-rna-trajectory.cwl +++ b/workflows/sc-rna-trajectory.cwl @@ -15,10 +15,6 @@ requirements: - "sc-atac-cluster.cwl" - "sc-wnn-cluster.cwl" - "sc-ctype-assign.cwl" - - "https://github.com/datirium/workflows/workflows/sc-rna-cluster.cwl" - - "https://github.com/datirium/workflows/workflows/sc-atac-cluster.cwl" - - "https://github.com/datirium/workflows/workflows/sc-wnn-cluster.cwl" - - "https://github.com/datirium/workflows/workflows/sc-ctype-assign.cwl" inputs: @@ -45,6 +41,20 @@ inputs: 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" 'sd:localLabel': true + dimensions: + type: int? + label: "Target dimensionality" + default: 0 + doc: | + Number of principal components to be used + in the trajectory analysis. Accepted values + range from 1 to 50. Will fail if used more + dimensions than it was available in the + selected "Single-cell Analysis with + Clustered RNA-Seq Datasets". 
If set to 0, + use all available dimensions + Default: 0 + query_source_column: type: string label: "Cells grouping" @@ -384,6 +394,9 @@ steps: barcodes_data: barcodes_data reduction: default: "pca" + dimensions: + source: dimensions + valueFrom: $(self==0?null:self) # to use all available dimensions query_source_column: query_source_column trajectory_start: source: trajectory_start From 2f84eeeb49c975899b23f970ae260b8e8eb5d45a Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 12 Sep 2023 12:10:57 -0400 Subject: [PATCH 071/162] Update diff expr workflow to use Combat-Seq to remove batch effect --- tools/deseq-multi-factor.cwl | 34 +++++----- workflows/deseq-multi-factor.cwl | 86 ++++++++++++++------------ workflows/filter-deseq-for-heatmap.cwl | 7 ++- 3 files changed, 69 insertions(+), 58 deletions(-) diff --git a/tools/deseq-multi-factor.cwl b/tools/deseq-multi-factor.cwl index d4022eb1..26c39da8 100644 --- a/tools/deseq-multi-factor.cwl +++ b/tools/deseq-multi-factor.cwl @@ -8,7 +8,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/deseq:v0.0.4 + dockerPull: biowardrobe2/deseq:v0.0.5 inputs: @@ -133,11 +133,11 @@ inputs: inputBinding: prefix: "--remove" doc: | - Column from the metadata file to remove batch effect when - exporting feature counts. All components that include this - term will be removed from the design formula when correcting - for batch effect. Default: do not remove batch effect from - the exported counts + Column from the metadata file to remove batch effect + before running differential expression analysis. If + present, all components that include this term will be + removed from the design and reduced formulas. 
+ Default: do not remove batch effect cluster_method: type: @@ -276,8 +276,9 @@ outputs: outputBinding: glob: "*_pca_plot.png" doc: | - PCA plot of normalized counts based on the top 500 - features selected by the highest row variance + PCA plot of normalized, optionally batch corrected, + read counts based on the top 500 features selected + by the highest row variance PNG format pca_plot_pdf: @@ -285,8 +286,9 @@ outputs: outputBinding: glob: "*_pca_plot.pdf" doc: | - PCA plot of normalized counts based on the top 500 - features selected by the highest row variance + PCA plot of normalized, optionally batch corrected, + read counts based on the top 500 features selected + by the highest row variance PDF format mds_plot_html: @@ -294,8 +296,8 @@ outputs: outputBinding: glob: "*_mds_plot.html" doc: | - MDS plot of normalized counts. Optionally batch corrected - based on the --remove value. + MDS plot of normalized, optionally batch corrected, + read counts HTML format stdout_log: @@ -422,10 +424,10 @@ s:about: | 30), when there is a wide range of sequencing depth across samples. Default: vst --remove REMOVE Column from the metadata file to remove batch effect - when exporting feature counts. All components that - include this term will be removed from the design - formula when correcting for batch effect. Default: do - not remove batch effect from the exported counts + before running differential expression analysis. If + present, all components that include this term will be + removed from the design and reduced formulas. + Default: do not remove batch effect --cluster {row,column,both} Hopach clustering method to be run on normalized read counts for the exploratory visualization analysis. diff --git a/workflows/deseq-multi-factor.cwl b/workflows/deseq-multi-factor.cwl index 3af3682b..897293c0 100644 --- a/workflows/deseq-multi-factor.cwl +++ b/workflows/deseq-multi-factor.cwl @@ -115,13 +115,13 @@ inputs: remove: type: string? 
- label: "Column from the metadata file to remove batch effect when exporting feature counts" + label: "Column from the metadata file to remove batch effect" doc: | - Column from the metadata file to remove batch effect when - exporting feature counts. All components that include this - term will be removed from the design formula when correcting - for batch effect. Default: do not remove batch effect from - the exported counts + Column from the metadata file to remove batch effect + before running differential expression analysis. If + present, all components that include this term will be + removed from the design and reduced formulas. + Default: do not remove batch effect base: type: string? @@ -152,25 +152,14 @@ inputs: symbols: - "vst" - "rlog" - default: "vst" + default: "rlog" label: "Read counts normalization for the exploratory visualization analysis" doc: | Read counts normalization for the exploratory visualization analysis. Use 'vst' for medium-to-large datasets (n > 30) and 'rlog' for small datasets (n < 30), when there is a wide range of sequencing depth across samples. - Default: vst - 'sd:layout': - advanced: true - - center_row: - type: boolean? - default: false - label: "Apply mean centering for feature expression prior to running clustering by row" - doc: | - Apply mean centering for feature expression prior to running - clustering by row. Ignored when --cluster is not row or both. - Default: do not centered + Default: rlog 'sd:layout': advanced: true @@ -259,22 +248,34 @@ inputs: advanced: true threads: - type: int? - default: 1 - label: "Number of cores/cpus to use" - doc: "Number of cores/cpus to use. Default: 1" - 'sd:layout': + type: + - "null" + - type: enum + symbols: + - "1" + - "2" + - "3" + - "4" + default: "1" + label: "Cores/CPUs" + doc: | + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. 
+ Default: 1 + "sd:layout": advanced: true outputs: - diff_expr_features: + diff_expr_file: type: File outputSource: deseq_multi_factor/diff_expr_features label: "TSV file with not filtered differentially expressed features" doc: | - TSV file with not filtered differentially expressed features + TSV file with not filtered differentially + expressed features 'sd:visualPlugins': - syncfusiongrid: tab: 'DE features' @@ -285,15 +286,16 @@ outputs: outputSource: deseq_multi_factor/read_counts_gct label: "GCT file with normalized, optionally batch corrected, read counts" doc: | - GCT file with normalized, optionally batch corrected, read counts + GCT file with normalized, optionally batch + corrected, read counts mds_plot_html: type: File? outputSource: deseq_multi_factor/mds_plot_html - label: "MDS plot of normalized counts" + label: "MDS plot of normalized, optionally batch corrected, read counts" doc: | - MDS plot of normalized counts. Optionally batch corrected - based on the --remove value. + MDS plot of normalized, optionally batch + corrected, read counts. HTML format 'sd:visualPlugins': - linkList: @@ -305,7 +307,8 @@ outputs: outputSource: deseq_multi_factor/volcano_plot_png label: "Volcano plot of differentially expressed features" doc: | - Volcano plot of differentially expressed features. + Volcano plot of differentially expressed + features. PNG format 'sd:visualPlugins': - image: @@ -315,15 +318,17 @@ outputs: pca_plot_png: type: File? 
outputSource: deseq_multi_factor/pca_plot_png - label: "PCA plot of normalized counts based on the top 500 features with the highest row variance" + label: "PCA plot of normalized, optionally batch corrected, read counts" doc: | - PCA plot of normalized counts based on the top 500 - features selected by the highest row variance + PCA plot of normalized, optionally batch + corrected, read counts based on the top + 500 features selected by the highest row + variance PNG format 'sd:visualPlugins': - image: tab: 'Plots' - Caption: 'PCA plot of normalized counts based on the top 500 features with the highest row variance' + Caption: 'PCA plot of normalized, optionally batch corrected, read counts' volcano_plot_html_file: type: File @@ -372,13 +377,13 @@ outputs: tab: 'Overview' target: "_blank" - deseq_stdout_log_file: + deseq_stdout_log: type: File outputSource: deseq_multi_factor/stdout_log label: "DESeq stdout log" doc: "DESeq stdout log" - deseq_stderr_log_file: + deseq_stderr_log: type: File outputSource: deseq_multi_factor/stderr_log label: "DESeq stderr log" @@ -437,12 +442,15 @@ steps: valueFrom: $(self=="none"?null:self) row_distance: row_distance column_distance: column_distance - center_row: center_row + center_row: + default: true selected_features: source: selected_features valueFrom: $(split_by_common_delim(self)) maximum_padj: maximum_padj - threads: threads + threads: + source: threads + valueFrom: $(parseInt(self)) out: - diff_expr_features - read_counts_gct diff --git a/workflows/filter-deseq-for-heatmap.cwl b/workflows/filter-deseq-for-heatmap.cwl index f105dd9b..fbb9d281 100644 --- a/workflows/filter-deseq-for-heatmap.cwl +++ b/workflows/filter-deseq-for-heatmap.cwl @@ -11,7 +11,8 @@ requirements: 'sd:upstream': sample_to_filter: - - "deseq.cwl" + - "deseq.cwl" + - "deseq-multi-factor.cwl" inputs: @@ -36,8 +37,8 @@ inputs: doc: "Filtering parameters (WHERE parameters for SQL query)" 'sd:filtering': params: - columns: ["RefseqId", "GeneId", "Chrom", 
"TxStart", "TxEnd", "Strand", "RpkmCondition1", "RpkmCondition2", "baseMean", "log2FoldChange", "pvalue", "padj"] - types: ["string", "string", "string", "number", "number", "string", "number", "number", "number", "number", "number", "number"] + columns: ["feature", "RefseqId", "GeneId", "Chrom", "TxStart", "TxEnd", "Strand", "RpkmCondition1", "RpkmCondition2", "baseMean", "log2FoldChange", "pvalue", "padj", "HCL", "HCL.1", "HCL.2", "HCL.3"] + types: ["string", "string", "string", "string", "number", "number", "string", "number", "number", "number", "number", "number", "number", "string", "string", "string", "string"] header: type: boolean? From 22c4488c7734babfce7fb8dd52c257e6ed73fafe Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 12 Sep 2023 12:20:34 -0400 Subject: [PATCH 072/162] Update DESeq to use Combat-Seq instead of Limma --- tools/deseq-multi-factor.cwl | 34 ++++++----- workflows/deseq-multi-factor.cwl | 82 ++++++++++++++------------ workflows/filter-deseq-for-heatmap.cwl | 7 ++- 3 files changed, 67 insertions(+), 56 deletions(-) diff --git a/tools/deseq-multi-factor.cwl b/tools/deseq-multi-factor.cwl index d4022eb1..26c39da8 100644 --- a/tools/deseq-multi-factor.cwl +++ b/tools/deseq-multi-factor.cwl @@ -8,7 +8,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/deseq:v0.0.4 + dockerPull: biowardrobe2/deseq:v0.0.5 inputs: @@ -133,11 +133,11 @@ inputs: inputBinding: prefix: "--remove" doc: | - Column from the metadata file to remove batch effect when - exporting feature counts. All components that include this - term will be removed from the design formula when correcting - for batch effect. Default: do not remove batch effect from - the exported counts + Column from the metadata file to remove batch effect + before running differential expression analysis. If + present, all components that include this term will be + removed from the design and reduced formulas. 
+ Default: do not remove batch effect cluster_method: type: @@ -276,8 +276,9 @@ outputs: outputBinding: glob: "*_pca_plot.png" doc: | - PCA plot of normalized counts based on the top 500 - features selected by the highest row variance + PCA plot of normalized, optionally batch corrected, + read counts based on the top 500 features selected + by the highest row variance PNG format pca_plot_pdf: @@ -285,8 +286,9 @@ outputs: outputBinding: glob: "*_pca_plot.pdf" doc: | - PCA plot of normalized counts based on the top 500 - features selected by the highest row variance + PCA plot of normalized, optionally batch corrected, + read counts based on the top 500 features selected + by the highest row variance PDF format mds_plot_html: @@ -294,8 +296,8 @@ outputs: outputBinding: glob: "*_mds_plot.html" doc: | - MDS plot of normalized counts. Optionally batch corrected - based on the --remove value. + MDS plot of normalized, optionally batch corrected, + read counts HTML format stdout_log: @@ -422,10 +424,10 @@ s:about: | 30), when there is a wide range of sequencing depth across samples. Default: vst --remove REMOVE Column from the metadata file to remove batch effect - when exporting feature counts. All components that - include this term will be removed from the design - formula when correcting for batch effect. Default: do - not remove batch effect from the exported counts + before running differential expression analysis. If + present, all components that include this term will be + removed from the design and reduced formulas. + Default: do not remove batch effect --cluster {row,column,both} Hopach clustering method to be run on normalized read counts for the exploratory visualization analysis. diff --git a/workflows/deseq-multi-factor.cwl b/workflows/deseq-multi-factor.cwl index 7365552c..03d74fcf 100644 --- a/workflows/deseq-multi-factor.cwl +++ b/workflows/deseq-multi-factor.cwl @@ -114,13 +114,13 @@ inputs: remove: type: string? 
- label: "Column from the metadata file to remove batch effect when exporting feature counts" + label: "Column from the metadata file to remove batch effect" doc: | - Column from the metadata file to remove batch effect when - exporting feature counts. All components that include this - term will be removed from the design formula when correcting - for batch effect. Default: do not remove batch effect from - the exported counts + Column from the metadata file to remove batch effect + before running differential expression analysis. If + present, all components that include this term will be + removed from the design and reduced formulas. + Default: do not remove batch effect base: type: string? @@ -151,25 +151,14 @@ inputs: symbols: - "vst" - "rlog" - default: "vst" + default: "rlog" label: "Read counts normalization for the exploratory visualization analysis" doc: | Read counts normalization for the exploratory visualization analysis. Use 'vst' for medium-to-large datasets (n > 30) and 'rlog' for small datasets (n < 30), when there is a wide range of sequencing depth across samples. - Default: vst - 'sd:layout': - advanced: true - - center_row: - type: boolean? - default: false - label: "Apply mean centering for feature expression prior to running clustering by row" - doc: | - Apply mean centering for feature expression prior to running - clustering by row. Ignored when --cluster is not row or both. - Default: do not centered + Default: rlog 'sd:layout': advanced: true @@ -258,22 +247,34 @@ inputs: advanced: true threads: - type: int? - default: 1 - label: "Number of cores/cpus to use" - doc: "Number of cores/cpus to use. Default: 1" - 'sd:layout': + type: + - "null" + - type: enum + symbols: + - "1" + - "2" + - "3" + - "4" + default: "1" + label: "Cores/CPUs" + doc: | + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. 
+ Default: 1 + "sd:layout": advanced: true outputs: - diff_expr_features: + diff_expr_file: type: File outputSource: deseq_multi_factor/diff_expr_features label: "TSV file with not filtered differentially expressed features" doc: | - TSV file with not filtered differentially expressed features + TSV file with not filtered differentially + expressed features 'sd:visualPlugins': - syncfusiongrid: tab: 'DE features' @@ -284,15 +285,16 @@ outputs: outputSource: deseq_multi_factor/read_counts_gct label: "GCT file with normalized, optionally batch corrected, read counts" doc: | - GCT file with normalized, optionally batch corrected, read counts + GCT file with normalized, optionally batch + corrected, read counts mds_plot_html: type: File? outputSource: deseq_multi_factor/mds_plot_html - label: "MDS plot of normalized counts" + label: "MDS plot of normalized, optionally batch corrected, read counts" doc: | - MDS plot of normalized counts. Optionally batch corrected - based on the --remove value. + MDS plot of normalized, optionally batch + corrected, read counts. HTML format 'sd:visualPlugins': - linkList: @@ -304,7 +306,8 @@ outputs: outputSource: deseq_multi_factor/volcano_plot_png label: "Volcano plot of differentially expressed features" doc: | - Volcano plot of differentially expressed features. + Volcano plot of differentially expressed + features. PNG format 'sd:visualPlugins': - image: @@ -314,15 +317,17 @@ outputs: pca_plot_png: type: File? 
outputSource: deseq_multi_factor/pca_plot_png - label: "PCA plot of normalized counts based on the top 500 features with the highest row variance" + label: "PCA plot of normalized, optionally batch corrected, read counts" doc: | - PCA plot of normalized counts based on the top 500 - features selected by the highest row variance + PCA plot of normalized, optionally batch + corrected, read counts based on the top + 500 features selected by the highest row + variance PNG format 'sd:visualPlugins': - image: tab: 'Plots' - Caption: 'PCA plot of normalized counts based on the top 500 features with the highest row variance' + Caption: 'PCA plot of normalized, optionally batch corrected, read counts' volcano_plot_html_file: type: File @@ -436,12 +441,15 @@ steps: valueFrom: $(self=="none"?null:self) row_distance: row_distance column_distance: column_distance - center_row: center_row + center_row: + default: true selected_features: source: selected_features valueFrom: $(split_by_common_delim(self)) maximum_padj: maximum_padj - threads: threads + threads: + source: threads + valueFrom: $(parseInt(self)) out: - diff_expr_features - read_counts_gct diff --git a/workflows/filter-deseq-for-heatmap.cwl b/workflows/filter-deseq-for-heatmap.cwl index f105dd9b..fbb9d281 100644 --- a/workflows/filter-deseq-for-heatmap.cwl +++ b/workflows/filter-deseq-for-heatmap.cwl @@ -11,7 +11,8 @@ requirements: 'sd:upstream': sample_to_filter: - - "deseq.cwl" + - "deseq.cwl" + - "deseq-multi-factor.cwl" inputs: @@ -36,8 +37,8 @@ inputs: doc: "Filtering parameters (WHERE parameters for SQL query)" 'sd:filtering': params: - columns: ["RefseqId", "GeneId", "Chrom", "TxStart", "TxEnd", "Strand", "RpkmCondition1", "RpkmCondition2", "baseMean", "log2FoldChange", "pvalue", "padj"] - types: ["string", "string", "string", "number", "number", "string", "number", "number", "number", "number", "number", "number"] + columns: ["feature", "RefseqId", "GeneId", "Chrom", "TxStart", "TxEnd", "Strand", 
"RpkmCondition1", "RpkmCondition2", "baseMean", "log2FoldChange", "pvalue", "padj", "HCL", "HCL.1", "HCL.2", "HCL.3"] + types: ["string", "string", "string", "string", "number", "number", "string", "number", "number", "number", "number", "number", "number", "string", "string", "string", "string"] header: type: boolean? From 301cd888f3e02203a9d916d1118eb4b8b86de8b1 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 14 Sep 2023 15:51:34 -0400 Subject: [PATCH 073/162] Replace rmdup with markdup, update collected statistics report --- tools/collect-statistics-chip-seq.cwl | 53 ++++++-- tools/samtools-filter.cwl | 40 +++--- tools/samtools-markdup.cwl | 184 ++++++++++++++++++++++++++ workflows/chipseq-pe.cwl | 157 +++++++++++----------- workflows/chipseq-se.cwl | 150 ++++++++++----------- workflows/trim-atacseq-pe.cwl | 155 +++++++++++----------- workflows/trim-atacseq-se.cwl | 146 ++++++++++---------- workflows/trim-chipseq-pe.cwl | 137 +++++++++---------- workflows/trim-chipseq-se.cwl | 128 ++++++++---------- 9 files changed, 666 insertions(+), 484 deletions(-) create mode 100644 tools/samtools-markdup.cwl diff --git a/tools/collect-statistics-chip-seq.cwl b/tools/collect-statistics-chip-seq.cwl index 21890927..1074d859 100644 --- a/tools/collect-statistics-chip-seq.cwl +++ b/tools/collect-statistics-chip-seq.cwl @@ -18,7 +18,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: rackspacedot/python37 + dockerPull: biowardrobe2/sc-tools:v0.0.29 inputs: @@ -26,19 +26,20 @@ inputs: script: type: string? 
default: | - #!/usr/bin/env python + #!/usr/bin/env python3 import os import sys import argparse + import pandas import yaml import math def cut_int(s): - return int(s.strip().split()[0]) + return int(str(s).strip().split()[0]) def cut_float(s): - return float(s.strip().split()[0]) + return float(str(s).strip().split()[0]) TRIMGALORE = { @@ -149,6 +150,11 @@ inputs: "function": int, "pair_end_specific": True }, + "reads duplicated": { + "alias": "reads/pairs duplicated", + "function": cut_int, + "pair_end_specific": True + }, "average length": { "alias": "reads average length", "function": float, @@ -177,6 +183,7 @@ inputs: "order": ["total reads/pairs", "reads/pairs mapped", "reads/pairs unmapped", + "reads/pairs duplicated", "reads average length", "reads maximum length", "reads average quality", @@ -228,6 +235,7 @@ inputs: general_parser.add_argument("--bamstats", help="Path to bam statistics report file", required=True) general_parser.add_argument("--bamstatsfilter", help="Path to bam statistics report file after filtering", required=True) general_parser.add_argument("--macs2", help="Path to MACS2 called peaks xls file", required=True) + general_parser.add_argument("--atdp", help="Path to ATDP output TSV file", required=True) general_parser.add_argument("--preseq", help="Path to Preseq output file", required=False) general_parser.add_argument("--paired", help="Process as paired-end. 
Default: False", action="store_true") general_parser.add_argument("--output", help="Output filename prefix", required=True) @@ -297,7 +305,7 @@ inputs: res_key, res_function, pair_end_specific = get_correspondent_key(key_dict, key) if not collected_results[header].get(res_key, None): if pair_end_specific and pair_end: - collected_results[header][res_key] = res_function(res_function(value)/2) + collected_results[header][res_key] = res_function(int(res_function(value)/2)) else: collected_results[header][res_key] = res_function(value) except Exception: @@ -331,6 +339,12 @@ inputs: collected_results[header] = {k: collected_results[header][k] for k in MACS2["order"] if k in collected_results[header]} + def process_atdp_results(filepath, collected_results, header): + if not collected_results.get(header, None): + collected_results[header] = {} + collected_results[header]["maximum"] = str(pandas.read_csv(filepath, sep="\t")["Y"].max()) + + def process_preseq_results(filepath, collected_results, header, threashold=0.001): px, py = 0, 0 for line in open_file(filepath): @@ -357,6 +371,7 @@ inputs: process_custom_report(args.bamstatsfilter, collected_results, "BAM statistics after filtering", BAMSTATS, bool(args.paired)) process_custom_report(args.macs2, collected_results, "peak calling statistics", MACS2) process_macs2_xls(args.macs2, collected_results, "peak calling statistics") + process_atdp_results(args.atdp, collected_results, "average tag density") if args.preseq: process_preseq_results(args.preseq, collected_results, "library preparation") return (collected_results) @@ -406,6 +421,7 @@ inputs: "total reads/pairs", "reads/pairs mapped", "reads/pairs unmapped", + "reads/pairs duplicated", "insert size average", "insert size standard deviation", "reads average length", @@ -416,6 +432,7 @@ inputs: "total reads/pairs", "reads/pairs mapped", "reads/pairs unmapped", + "reads/pairs duplicated", "insert size average", "insert size standard deviation", "reads average length", @@ 
-428,7 +445,10 @@ inputs: "total reads/pairs in treatment", "reads/pairs after filtering in treatment", "redundant rate in treatment", - "fraction of reads in peaks"] + "fraction of reads in peaks", + + "average tag density", + "maximum"] if collected_data.get("adapter trimming statistics", None): header.extend(["adapter trimming statistics", @@ -464,6 +484,7 @@ inputs: collected_data["BAM statistics"]["total reads/pairs"], collected_data["BAM statistics"]["reads/pairs mapped"], collected_data["BAM statistics"]["reads/pairs unmapped"], + collected_data["BAM statistics"]["reads/pairs duplicated"], collected_data["BAM statistics"]["insert size average"], collected_data["BAM statistics"]["insert size standard deviation"], collected_data["BAM statistics"]["reads average length"], @@ -474,6 +495,7 @@ inputs: collected_data["BAM statistics after filtering"]["total reads/pairs"], collected_data["BAM statistics after filtering"]["reads/pairs mapped"], collected_data["BAM statistics after filtering"]["reads/pairs unmapped"], + collected_data["BAM statistics after filtering"]["reads/pairs duplicated"], collected_data["BAM statistics after filtering"]["insert size average"], collected_data["BAM statistics after filtering"]["insert size standard deviation"], collected_data["BAM statistics after filtering"]["reads average length"], @@ -486,7 +508,10 @@ inputs: collected_data["peak calling statistics"]["total reads/pairs in treatment"], collected_data["peak calling statistics"]["reads/pairs after filtering in treatment"], collected_data["peak calling statistics"]["redundant rate in treatment"], - collected_data["peak calling statistics"]["fraction of reads in peaks"]] + collected_data["peak calling statistics"]["fraction of reads in peaks"], + + "", + collected_data["average tag density"]["maximum"]] if collected_data.get("adapter trimming statistics", None): data.extend(["", @@ -559,22 +584,28 @@ inputs: position: 11 prefix: "--macs2" + atdp_results: + type: File + 
inputBinding: + position: 12 + prefix: "--atdp" + preseq_results: type: File? inputBinding: - position: 12 + position: 13 prefix: "--preseq" paired_end: type: boolean? inputBinding: - position: 13 + position: 14 prefix: "--paired" output_prefix: type: string? inputBinding: - position: 14 + position: 15 prefix: "--output" valueFrom: $(get_output_prefix()) default: "" @@ -605,7 +636,7 @@ outputs: outputEval: $(parseInt(self[0].contents.split('\n')[1].split('\t')[1])) -baseCommand: [python, '-c'] +baseCommand: [python3, '-c'] $namespaces: diff --git a/tools/samtools-filter.cwl b/tools/samtools-filter.cwl index 531a4fe4..f1d3012c 100644 --- a/tools/samtools-filter.cwl +++ b/tools/samtools-filter.cwl @@ -3,11 +3,8 @@ class: CommandLineTool requirements: - class: InlineJavascriptRequirement - - -hints: - class: DockerRequirement - dockerPull: biowardrobe2/samtools:v1.4 + dockerPull: biowardrobe2/samtools:v1.11 inputs: @@ -16,15 +13,20 @@ inputs: type: string? default: | #!/bin/bash + echo "Copy $0 to temp.bam" + cp $0 temp.bam + samtools sort temp.bam -o temp_sorted.bam + samtools index temp_sorted.bam echo "Filtering BAM file" - echo "samtools idxstats $0 | cut -f 1 | grep -v -E \"`echo $1 | sed -e 's/ /$|/g'`$|\*\" | xargs samtools view -q $2 -o $3 $0" - samtools idxstats $0 | cut -f 1 | grep -v -E "`echo $1 | sed -e 's/ /$|/g'`$|\*" | xargs samtools view -q $2 -o $3 $0 + echo "samtools idxstats temp_sorted.bam | cut -f 1 | grep -v -E \"`echo $1 | sed -e 's/ /$|/g'`$|\*\" | xargs samtools view -q $2 -F $3 -o temp_filtered.bam temp_sorted.bam" + samtools idxstats temp_sorted.bam | cut -f 1 | grep -v -E "`echo $1 | sed -e 's/ /$|/g'`$|\*" | xargs samtools view -q $2 -F $3 -o temp_filtered.bam temp_sorted.bam echo "Sorting BAM file" - echo "samtools sort $3 -o $3" - samtools sort $3 -o $3 + echo "samtools sort temp_filtered.bam -o $4" + samtools sort temp_filtered.bam -o $4 echo "Indexing BAM file" - echo "samtools index $3" - samtools index $3 + echo "samtools index $4" 
+ samtools index $4 + rm -f temp* inputBinding: position: 5 doc: "Script to exclude chromosomes from the BAM file and filter reads by quality" @@ -49,11 +51,18 @@ inputs: position: 8 default: 0 doc: "Skip alignments with MAPQ smaller than INT. Default 0" - + + negative_flag: + type: int? + inputBinding: + position: 9 + default: 0 + doc: "Do not output alignments with any bits set in INT present in the FLAG field. Default 0" + output_filename: type: string? inputBinding: - position: 9 + position: 10 valueFrom: | ${ return (self == "")?inputs.bam_bai_pair.basename:self; @@ -121,8 +130,9 @@ s:creator: - id: http://orcid.org/0000-0002-6486-3898 doc: | - Excludes chromosomes from the input BAM file. Filters reads by quality. - If there is only one chromosome present, you cannot exclude it + Excludes chromosomes from the input BAM file. + Optionally filters reads by quality and flags s:about: | - Excludes chromosomes from the input BAM file \ No newline at end of file + Excludes chromosomes from the input BAM file. + Optionally filters reads by quality and flags \ No newline at end of file diff --git a/tools/samtools-markdup.cwl b/tools/samtools-markdup.cwl new file mode 100644 index 00000000..b777f859 --- /dev/null +++ b/tools/samtools-markdup.cwl @@ -0,0 +1,184 @@ +cwlVersion: v1.0 +class: CommandLineTool + +requirements: +- class: InlineJavascriptRequirement +- class: InitialWorkDirRequirement + listing: | + ${ + return [ + { + "entry": inputs.bam_bai_pair, + "entryname": inputs.bam_bai_pair.basename, + "writable": true + } + ] + } +- class: DockerRequirement + dockerPull: biowardrobe2/samtools:v1.11 + + +inputs: + + script: + type: string? 
+ default: | + + #!/bin/bash + echo "Rename $0 to temp.bam" + mv $0 temp.bam + if [ -f $0.bai ]; then + echo "Rename $0.bai to temp.bam.bai" + mv $0.bai temp.bam.bai + fi + + echo "Sorting BAM file by name" + echo "samtools sort -n -@ $3 -o namesorted.bam temp.bam" + samtools sort -n -@ $3 -o namesorted.bam temp.bam + + echo "Filling in mate coordinates and inserting size fields" + echo "samtools fixmate -m -@ $3 namesorted.bam fixed.bam" + samtools fixmate -m -@ $3 namesorted.bam fixed.bam + + echo "Sorting BAM file by coordinates" + echo "samtools sort -@ $3 -o positionsorted.bam fixed.bam" + samtools sort -@ $3 -o positionsorted.bam fixed.bam + + if [ "$1" = "true" ] + then + echo "Only marking PCR duplicates" + echo "samtools markdup -c -s -@ $3 positionsorted.bam markduped.bam" + samtools markdup -c -s -@ $3 positionsorted.bam markduped.bam 2> markdup_report.tsv + else + echo "Removing PCR duplicates" + echo "samtools markdup -c -r -s -@ $3 positionsorted.bam markduped.bam" + samtools markdup -c -r -s -@ $3 positionsorted.bam markduped.bam 2> markdup_report.tsv + fi + + echo "Sorting BAM file" + echo "samtools sort -@ $3 markduped.bam -o $2" + samtools sort -@ $3 markduped.bam -o $2 + + echo "Indexing BAM file" + echo "samtools index $2" + samtools index $2 + + echo "Removing temporary files" + rm -f namesorted.bam fixed.bam positionsorted.bam markduped.bam temp.bam* + + inputBinding: + position: 5 + doc: "Script to remove PCR duplicates" + + bam_bai_pair: + type: File + inputBinding: + position: 6 + doc: BAM (optionally BAI) files + + keep_duplicates: + type: boolean? + default: false # somehow when omitted, valueFrom is not evaluated + inputBinding: + position: 7 + valueFrom: $(self?"true":"false") + doc: | + If true duplicates will be only + marked, oterwise - removed + + output_filename: + type: string? 
+ inputBinding: + position: 8 + valueFrom: | + ${ + return (self == "")?inputs.bam_bai_pair.basename:self; + } + default: "" + doc: "Output filename for the filtered BAM file" + + threads: + type: int? + inputBinding: + position: 9 + default: 1 + doc: "Number of threads to use" + + +outputs: + + deduplicated_bam_bai_pair: + type: File + outputBinding: + glob: "*.bam" + secondaryFiles: + - .bai + doc: "BAM+BAI files with PCR duplicates removed" + + markdup_report: + type: File + outputBinding: + glob: "markdup_report.tsv" + doc: "Markdup report" + + +baseCommand: [bash, '-c'] + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +s:mainEntity: + $import: ./metadata/samtools-metadata.yaml + +s:name: "samtools-markdup" +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/samtools-markdup.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + +doc: | + Removes or only marks PCR duplicates from + coordinate sorted and indexed BAM 
file. + Returns coordinate sorted and indexed BAM + files. Stages input bam_bai_pair to workdir. + Otherwise samtools sort fails. + +s:about: | + Removes or only marks PCR duplicates from + coordinate sorted and indexed BAM file. + Returns coordinate sorted and indexed BAM + files. Stages input bam_bai_pair to workdir. + Otherwise samtools sort fails. \ No newline at end of file diff --git a/workflows/chipseq-pe.cwl b/workflows/chipseq-pe.cwl index 86d01f12..1c5c2e65 100644 --- a/workflows/chipseq-pe.cwl +++ b/workflows/chipseq-pe.cwl @@ -64,14 +64,20 @@ inputs: doc: "Set to call broad peak for MACS2" fastq_file_upstream: - type: File - label: "FASTQ 1 input file" + type: + - File + - type: array + items: File + label: "FASTQ 1 input file(s)" format: "http://edamontology.org/format_1930" doc: "Reads data in a FASTQ format, received after paired end sequencing" fastq_file_downstream: - type: File - label: "FASTQ 2 input file" + type: + - File + - type: array + items: File + label: "FASTQ 2 input file(s)" format: "http://edamontology.org/format_1930" doc: "Reads data in a FASTQ format, received after paired end sequencing" @@ -115,6 +121,16 @@ inputs: label: "Remove duplicates" doc: "Calls samtools rmdup to remove duplicates from sortesd BAM file" + peak_calling_fdr: + type: float? + default: 0.05 + 'sd:layout': + advanced: true + label: "Minimum FDR (q-value) cutoff for peak detection" + doc: | + Minimum FDR (q-value) cutoff for peak detection. -q, and + -p are mutually exclusive. + promoter_dist: type: int? 
default: 1000 @@ -266,19 +282,12 @@ outputs: data: [$1, $2] comparable: "atdp" - samtools_rmdup_log: - type: File - label: "Remove duplicates log" - format: "http://edamontology.org/format_2330" - doc: "Samtools rmdup generated log" - outputSource: samtools_rmdup/rmdup_log - bambai_pair: type: File format: "http://edamontology.org/format_2572" label: "Coordinate sorted BAM alignment file (+index BAI)" doc: "Coordinate sorted BAM file and BAI index file" - outputSource: samtools_sort_index_after_rmdup/bam_bai_pair + outputSource: samtools_remove_duplicates/deduplicated_bam_bai_pair 'sd:visualPlugins': - igvbrowser: tab: 'IGV Genome Browser' @@ -308,8 +317,7 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Narrow peaks" - displayMode: "COLLAPSE" - height: 40 + height: 120 macs2_broad_peaks: type: File? @@ -323,8 +331,7 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Broad peaks" - displayMode: "COLLAPSE" - height: 40 + height: 120 macs2_peak_summits: type: File? @@ -352,8 +359,7 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Gapped peaks" - displayMode: "COLLAPSE" - height: 40 + height: 120 macs2_log: type: File? @@ -433,21 +439,22 @@ outputs: doc: "fragment, calculated fragment, islands count from MACS2 results" outputSource: macs2_callpeak/macs2_stat_file - preseq_estimates_plot_data: + preseq_estimates: type: File? 
label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq_plot_data/estimates_file_plot_data + outputSource: preseq/estimates_file 'sd:visualPlugins': - - line: + - scatter: tab: 'QC Plots' - Title: 'Distinct Read Counts Estimates' - xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' - yAxisTitle: 'Distinct Reads Count' - colors: ["#4b78a3", "#a3514b"] + Title: 'Preseq Estimates' + xAxisTitle: 'Total reads count' + yAxisTitle: 'Expected distinct reads count' + colors: ["#4b78a3"] height: 500 - data: [$2, $5] + data: [$1, $2] + comparable: "preseq" estimated_fragment_size: type: int @@ -472,9 +479,9 @@ steps: the core facility by providing a URL or from GEO by providing SRA accession number. run: ../tools/extract-fastq.cwl in: - output_prefix: - default: "read_1" compressed_file: fastq_file_upstream + output_prefix: + default: "read_1" out: [fastq_file] extract_fastq_downstream: @@ -485,9 +492,9 @@ steps: the core facility by providing a URL or from GEO by providing SRA accession number. run: ../tools/extract-fastq.cwl in: - output_prefix: - default: "read_2" compressed_file: fastq_file_downstream + output_prefix: + default: "read_2" out: [fastq_file] fastx_quality_stats_upstream: @@ -515,8 +522,11 @@ steps: bowtie_aligner: label: "Alignment to reference genome" doc: | - Aligns reads to the reference genome keeping only uniquely mapped reads with - less than 3 mismatches. + Aligns reads to the reference genome. + Reads are assumed to be mapped if they + have less than 3 mismatches. + sam_file output includes both mapped + and unmapped reads. 
run: ../tools/bowtie-alignreads.cwl in: upstream_filelist: extract_fastq_upstream/fastq_file @@ -552,11 +562,14 @@ steps: threads: threads out: [bam_bai_pair] - clean_sam_headers_for_preseq: - run: ../tools/samtools-clean-headers.cwl + samtools_mark_duplicates: + run: ../tools/samtools-markdup.cwl in: - bam_file: samtools_sort_index/bam_bai_pair - out: [preseq_bam] + bam_bai_pair: samtools_sort_index/bam_bai_pair + keep_duplicates: + default: true + threads: threads + out: [deduplicated_bam_bai_pair] preseq: label: "Sequencing depth estimation" @@ -565,32 +578,22 @@ steps: be expected from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: clean_sam_headers_for_preseq/preseq_bam + bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair pe_mode: default: true extrapolation: default: 1000000000 - out: [estimates_file, log_file_stdout, log_file_stderr] - - samtools_rmdup: - label: "PCR duplicates removal" - doc: | - Removes potential PCR duplicates. This step is used to remove reads overamplified - in PCR. Unfortunately, it may also remove "good" reads. We do not recommend to - remove duplicates unless the library is heavily duplicated. - run: ../tools/samtools-rmdup.cwl - in: - trigger: remove_duplicates - bam_file: samtools_sort_index/bam_bai_pair - out: [rmdup_output, rmdup_log] + out: [estimates_file] - samtools_sort_index_after_rmdup: - run: ../tools/samtools-sort-index.cwl + samtools_remove_duplicates: + run: ../tools/samtools-markdup.cwl in: - trigger: remove_duplicates - sort_input: samtools_rmdup/rmdup_output + bam_bai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair + keep_duplicates: + source: remove_duplicates + valueFrom: $(!self) threads: threads - out: [bam_bai_pair] + out: [deduplicated_bam_bai_pair] macs2_callpeak: label: "Peak detection" @@ -599,7 +602,7 @@ steps: transcription factor binding sites. 
run: ../tools/macs2-callpeak-biowardrobe-only.cwl in: - treatment_file: samtools_sort_index_after_rmdup/bam_bai_pair + treatment_file: samtools_remove_duplicates/deduplicated_bam_bai_pair control_file: control_file nolambda: source: control_file @@ -618,8 +621,7 @@ steps: valueFrom: $(!self) keep_dup: default: auto - q_value: - default: 0.05 + q_value: peak_calling_fdr format_mode: default: BAMPE buffer_size: @@ -640,7 +642,7 @@ steps: bam_to_bigwig: run: ../tools/bam-bedgraph-bigwig.cwl in: - bam_file: samtools_sort_index_after_rmdup/bam_bai_pair + bam_file: samtools_remove_duplicates/deduplicated_bam_bai_pair chrom_length_file: chrom_length mapped_reads_number: get_stat/mapped_reads pairchip: @@ -654,20 +656,20 @@ steps: read length and quality score, etc. run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index/bam_bai_pair + bambai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index/bam_bai_pair + source: samtools_mark_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") out: [log_file] get_bam_statistics_after_filtering: run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index_after_rmdup/bam_bai_pair + bambai_pair: samtools_remove_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index_after_rmdup/bam_bai_pair + source: samtools_remove_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report_after_filtering.txt") - out: [log_file, ext_is_section] + out: [log_file, ext_is_section, reads_mapped] get_stat: run: ../tools/collect-statistics-chip-seq.cwl @@ -676,23 +678,12 @@ steps: bam_statistics_report: get_bam_statistics/log_file bam_statistics_after_filtering_report: get_bam_statistics_after_filtering/log_file macs2_called_peaks: macs2_callpeak/peak_xls_file + atdp_results: average_tag_density/result_file preseq_results: preseq/estimates_file paired_end: default: 
True out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] - preseq_plot_data: - label: "Formats sequencing depth estimation data for plotting" - doc: | - Formats estimates file from preseq standard output for QC plotting. This adds a new - column that includes the actual read count point on the plot. - run: ../tools/preseq-plot-data.cwl - in: - preseq_stderr_log_file: preseq/log_file_stderr - estimates_file: preseq/estimates_file - mapped_reads: get_stat/mapped_reads - out: [estimates_file_plot_data] - island_intersect: label: "Peak annotation" doc: | @@ -713,7 +704,7 @@ steps: elements are close to the TSS of their targets. run: ../tools/atdp.cwl in: - input_file: samtools_sort_index_after_rmdup/bam_bai_pair + input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: @@ -726,7 +717,9 @@ steps: default: "chrX chrY" avd_heat_window_bp: default: 200 - mapped_reads: get_stat/mapped_reads + mapped_reads: + source: get_bam_statistics_after_filtering/reads_mapped + valueFrom: $(parseInt(self/2)) out: [result_file, log_file] @@ -736,8 +729,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Deprecated. ChIP-Seq pipeline paired-end" -s:name: "Deprecated. ChIP-Seq pipeline paired-end" +label: "ChIP-Seq pipeline paired-end" +s:name: "ChIP-Seq pipeline paired-end" s:alternateName: "ChIP-Seq basic analysis workflow for a paired-end experiment" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/chipseq-pe.cwl @@ -797,9 +790,7 @@ doc: | *samtools\_sort\_index*. Depending on workflow’s input parameters indexed and sorted BAM file - can be processed by `samtools rmdup` *samtools\_rmdup* to get rid of duplicated reads. - If removing duplicates is not required the original BAM and BAI - files are returned. 
Otherwise step *samtools\_sort\_index\_after\_rmdup* repeat `samtools sort` and `samtools index` with BAM and BAI files without duplicates. + can be processed by `samtools markdup` *samtools\_remove\_duplicates* to get rid of duplicated reads. Next `macs2 callpeak` performs peak calling *macs2\_callpeak* and the next step reports *macs2\_island\_count* the number of islands and estimated fragment size. If the latter diff --git a/workflows/chipseq-se.cwl b/workflows/chipseq-se.cwl index d3409eaa..466f2aaf 100644 --- a/workflows/chipseq-se.cwl +++ b/workflows/chipseq-se.cwl @@ -66,8 +66,11 @@ inputs: doc: "Make MACS2 call broad peaks by linking nearby highly enriched regions" fastq_file: - type: File - label: "FASTQ file" + type: + - File + - type: array + items: File + label: "FASTQ file(s)" format: "http://edamontology.org/format_1930" doc: "Single-read sequencing data in FASTQ format (fastq, fq, bzip2, gzip, zip)" @@ -111,6 +114,16 @@ inputs: label: "Remove PCR duplicates" doc: "Remove PCR duplicates from sorted BAM file" + peak_calling_fdr: + type: float? + default: 0.05 + 'sd:layout': + advanced: true + label: "Minimum FDR (q-value) cutoff for peak detection" + doc: | + Minimum FDR (q-value) cutoff for peak detection. -q, and + -p are mutually exclusive. + promoter_dist: type: int? default: 1000 @@ -233,7 +246,7 @@ outputs: format: "http://edamontology.org/format_2572" label: "Aligned reads" doc: "Coordinate sorted BAM alignment and index BAI files" - outputSource: samtools_sort_index_after_rmdup/bam_bai_pair + outputSource: samtools_remove_duplicates/deduplicated_bam_bai_pair 'sd:visualPlugins': - igvbrowser: tab: 'IGV Genome Browser' @@ -262,8 +275,7 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Narrow peaks" - displayMode: "COLLAPSE" - height: 40 + height: 120 macs2_broad_peaks: type: File? 
@@ -277,22 +289,21 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Broad peaks" - displayMode: "COLLAPSE" - height: 40 + height: 120 workflow_statistics_yaml: type: File? label: "YAML formatted combined log" format: "http://edamontology.org/format_3750" doc: "YAML formatted combined log" - outputSource: get_stat/collected_statistics_yaml + outputSource: get_statistics/collected_statistics_yaml workflow_statistics_markdown: type: File? label: "Markdown formatted combined log" format: "http://edamontology.org/format_3835" doc: "Markdown formatted combined log" - outputSource: get_stat/collected_statistics_md + outputSource: get_statistics/collected_statistics_md 'sd:visualPlugins': - markdownView: tab: 'Overview' @@ -302,7 +313,7 @@ outputs: label: "Workflow execution statistics" format: "http://edamontology.org/format_3475" doc: "Overall workflow execution statistics from bowtie_aligner and samtools_rmdup steps" - outputSource: get_stat/collected_statistics_tsv + outputSource: get_statistics/collected_statistics_tsv 'sd:visualPlugins': - tableView: vertical: true @@ -327,21 +338,22 @@ outputs: doc: "BAM statistics report (after all filters applied)" outputSource: get_bam_statistics_after_filtering/log_file - preseq_estimates_plot_data: + preseq_estimates: type: File? 
- label: "Preseq estimates" + label: "Expected Distinct Reads Count Plot" format: "http://edamontology.org/format_3475" - doc: "Preseq estimated results" - outputSource: preseq_plot_data/estimates_file_plot_data + doc: "Expected distinct reads count file from Preseq in TSV format" + outputSource: preseq/estimates_file 'sd:visualPlugins': - - line: + - scatter: tab: 'QC Plots' - Title: 'Distinct Read Counts Estimates' - xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' - yAxisTitle: 'Distinct Reads Count' - colors: ["#4b78a3", "#a3514b"] + Title: 'Expected Distinct Reads Count Plot' + xAxisTitle: 'Total reads count' + yAxisTitle: 'Expected distinct reads count' + colors: ["#4b78a3"] height: 500 - data: [$2, $5] + data: [$1, $2] + comparable: "preseq" estimated_fragment_size: type: int @@ -353,7 +365,7 @@ outputs: type: int label: "Mapped reads number" doc: "Mapped reads number for downstream analyses" - outputSource: get_stat/mapped_reads + outputSource: get_statistics/mapped_reads steps: @@ -367,6 +379,8 @@ steps: run: ../tools/extract-fastq.cwl in: compressed_file: fastq_file + output_prefix: + default: "read_1" out: [fastq_file] fastx_quality_stats: @@ -383,8 +397,11 @@ steps: bowtie_aligner: label: "Alignment to reference genome" doc: | - Aligns reads to the reference genome keeping only uniquely mapped reads with - less than 3 mismatches. + Aligns reads to the reference genome. + Reads are assumed to be mapped if they + have less than 3 mismatches. + sam_file output includes both mapped + and unmapped reads. 
run: ../tools/bowtie-alignreads.cwl in: upstream_filelist: extract_fastq/fastq_file @@ -423,11 +440,14 @@ steps: threads: threads out: [bam_bai_pair] - clean_sam_headers_for_preseq: - run: ../tools/samtools-clean-headers.cwl + samtools_mark_duplicates: + run: ../tools/samtools-markdup.cwl in: - bam_file: samtools_sort_index/bam_bai_pair - out: [preseq_bam] + bam_bai_pair: samtools_sort_index/bam_bai_pair + keep_duplicates: + default: true + threads: threads + out: [deduplicated_bam_bai_pair] preseq: label: "Sequencing depth estimation" @@ -436,34 +456,20 @@ steps: be expected from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: clean_sam_headers_for_preseq/preseq_bam + bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair extrapolation: default: 1000000000 - out: [estimates_file, log_file_stdout, log_file_stderr] + out: [estimates_file] - samtools_rmdup: - label: "PCR duplicates removal" - doc: | - Removes potential PCR duplicates. This step is used to remove reads overamplified - in PCR. Unfortunately, it may also remove "good" reads. We do not recommend to - remove duplicates unless the library is heavily duplicated. - run: ../tools/samtools-rmdup.cwl - in: - trigger: remove_duplicates - bam_file: samtools_sort_index/bam_bai_pair - single_end: - default: true - out: - - rmdup_output - - rmdup_log - - samtools_sort_index_after_rmdup: - run: ../tools/samtools-sort-index.cwl + samtools_remove_duplicates: + run: ../tools/samtools-markdup.cwl in: - trigger: remove_duplicates - sort_input: samtools_rmdup/rmdup_output + bam_bai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair + keep_duplicates: + source: remove_duplicates + valueFrom: $(!self) threads: threads - out: [bam_bai_pair] + out: [deduplicated_bam_bai_pair] macs2_callpeak: label: "Peak detection" @@ -472,7 +478,7 @@ steps: transcription factor binding sites. 
run: ../tools/macs2-callpeak-biowardrobe-only.cwl in: - treatment_file: samtools_sort_index_after_rmdup/bam_bai_pair + treatment_file: samtools_remove_duplicates/deduplicated_bam_bai_pair control_file: control_file nolambda: source: control_file @@ -491,8 +497,7 @@ steps: valueFrom: $(!self) keep_dup: default: auto - q_value: - default: 0.05 + q_value: peak_calling_fdr format_mode: default: BAM buffer_size: @@ -506,9 +511,9 @@ steps: bam_to_bigwig: run: ../tools/bam-bedgraph-bigwig.cwl in: - bam_file: samtools_sort_index_after_rmdup/bam_bai_pair + bam_file: samtools_remove_duplicates/deduplicated_bam_bai_pair chrom_length_file: chrom_length - mapped_reads_number: get_stat/mapped_reads + mapped_reads_number: get_statistics/mapped_reads fragment_size: macs2_callpeak/macs2_fragments_calculated out: [bigwig_file] @@ -519,43 +524,32 @@ steps: read length and quality score, etc. run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index/bam_bai_pair + bambai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index/bam_bai_pair + source: samtools_mark_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") out: [log_file] get_bam_statistics_after_filtering: run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index_after_rmdup/bam_bai_pair + bambai_pair: samtools_remove_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index_after_rmdup/bam_bai_pair + source: samtools_remove_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report_after_filtering.txt") - out: [log_file] + out: [log_file, reads_mapped] - get_stat: + get_statistics: run: ../tools/collect-statistics-chip-seq.cwl in: bowtie_alignment_report: bowtie_aligner/log_file bam_statistics_report: get_bam_statistics/log_file bam_statistics_after_filtering_report: get_bam_statistics_after_filtering/log_file macs2_called_peaks: 
macs2_callpeak/peak_xls_file + atdp_results: average_tag_density/result_file preseq_results: preseq/estimates_file out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] - preseq_plot_data: - label: "Formats sequencing depth estimation data for plotting" - doc: | - Formats estimates file from preseq standard output for QC plotting. This adds a new - column that includes the actual read count point on the plot. - run: ../tools/preseq-plot-data.cwl - in: - preseq_stderr_log_file: preseq/log_file_stderr - estimates_file: preseq/estimates_file - mapped_reads: get_stat/mapped_reads - out: [estimates_file_plot_data] - island_intersect: label: "Peak annotation" doc: | @@ -576,7 +570,7 @@ steps: elements are close to the TSS of their targets. run: ../tools/atdp.cwl in: - input_file: samtools_sort_index_after_rmdup/bam_bai_pair + input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: @@ -589,7 +583,9 @@ steps: default: "chrX chrY" avd_heat_window_bp: default: 200 - mapped_reads: get_stat/mapped_reads + mapped_reads: + source: get_bam_statistics_after_filtering/reads_mapped + valueFrom: $(parseInt(self)) out: [result_file] @@ -599,8 +595,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Deprecated. ChIP-Seq pipeline single-read" -s:name: "Deprecated. 
ChIP-Seq pipeline single-read" +label: "ChIP-Seq pipeline single-read" +s:name: "ChIP-Seq pipeline single-read" s:alternateName: "ChIP-Seq basic analysis workflow for single-read data" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/chipseq-se.cwl diff --git a/workflows/trim-atacseq-pe.cwl b/workflows/trim-atacseq-pe.cwl index 73e0ab90..ee59db3d 100644 --- a/workflows/trim-atacseq-pe.cwl +++ b/workflows/trim-atacseq-pe.cwl @@ -68,7 +68,7 @@ inputs: - File - type: array items: File - label: "FASTQ read 1 input file" + label: "FASTQ read 1 input file(s)" format: "http://edamontology.org/format_1930" doc: "Reads data in a FASTQ format, received after paired end sequencing" @@ -77,7 +77,7 @@ inputs: - File - type: array items: File - label: "FASTQ read 2 input file" + label: "FASTQ read 2 input file(s)" format: "http://edamontology.org/format_1930" doc: "Reads data in a FASTQ format, received after paired end sequencing" @@ -121,6 +121,16 @@ inputs: label: "Remove duplicates" doc: "Calls samtools rmdup to remove duplicates from sortesd BAM file" + peak_calling_fdr: + type: float? + default: 0.05 + 'sd:layout': + advanced: true + label: "Minimum FDR (q-value) cutoff for peak detection" + doc: | + Minimum FDR (q-value) cutoff for peak detection. -q, and + -p are mutually exclusive. + exclude_chromosome: type: string? default: "chrM" @@ -129,6 +139,14 @@ inputs: label: "Exclude chromosomes" doc: "Space separated list of chromosomes to be excluded" + do_not_scale: + type: boolean? + default: false + 'sd:layout': + advanced: true + label: "Do not scale genome coverage based on mapped reads number" + doc: "When converting BAM to bigWig, the scale will be set to 1 by default" + promoter_dist: type: int? 
default: 1000 @@ -280,19 +298,12 @@ outputs: data: [$1, $2] comparable: "atdp" - samtools_rmdup_log: - type: File - label: "Remove duplicates log" - format: "http://edamontology.org/format_2330" - doc: "Samtools rmdup generated log" - outputSource: samtools_rmdup/rmdup_log - bambai_pair: type: File format: "http://edamontology.org/format_2572" label: "Coordinate sorted BAM alignment file (+index BAI)" doc: "Coordinate sorted BAM file and BAI index file" - outputSource: samtools_sort_index_after_rmdup/bam_bai_pair + outputSource: samtools_remove_duplicates/deduplicated_bam_bai_pair 'sd:visualPlugins': - igvbrowser: tab: 'IGV Genome Browser' @@ -459,21 +470,22 @@ outputs: doc: "TrimGalore generated log for FASTQ 2" outputSource: trim_fastq/report_file_pair - preseq_estimates_plot_data: + preseq_estimates: type: File? label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq_plot_data/estimates_file_plot_data + outputSource: preseq/estimates_file 'sd:visualPlugins': - - line: + - scatter: tab: 'QC Plots' - Title: 'Distinct Read Counts Estimates' - xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' - yAxisTitle: 'Distinct Reads Count' - colors: ["#4b78a3", "#a3514b"] + Title: 'Preseq Estimates' + xAxisTitle: 'Total reads count' + yAxisTitle: 'Expected distinct reads count' + colors: ["#4b78a3"] height: 500 - data: [$2, $5] + data: [$1, $2] + comparable: "preseq" estimated_fragment_size: type: int @@ -497,9 +509,9 @@ steps: the core facility by providing a URL or from GEO by providing SRA accession number. run: ../tools/extract-fastq.cwl in: - output_prefix: - default: "read_1" compressed_file: fastq_file_upstream + output_prefix: + default: "read_1" out: [fastq_file] extract_fastq_downstream: @@ -510,9 +522,9 @@ steps: the core facility by providing a URL or from GEO by providing SRA accession number. 
run: ../tools/extract-fastq.cwl in: - output_prefix: - default: "read_2" compressed_file: fastq_file_downstream + output_prefix: + default: "read_2" out: [fastq_file] trim_fastq: @@ -603,8 +615,11 @@ steps: bowtie_aligner: label: "Alignment to reference genome" doc: | - Aligns reads to the reference genome keeping only uniquely mapped reads with - less than 3 mismatches. + Aligns reads to the reference genome. + Reads are assumed to be mapped if they + have less than 3 mismatches. + sam_file output includes both mapped + and unmapped reads. run: ../tools/bowtie-alignreads.cwl in: upstream_filelist: rename_upstream/target_file @@ -640,19 +655,22 @@ steps: threads: threads out: [bam_bai_pair] + samtools_mark_duplicates: + run: ../tools/samtools-markdup.cwl + in: + bam_bai_pair: samtools_sort_index/bam_bai_pair + keep_duplicates: + default: true + threads: threads + out: [deduplicated_bam_bai_pair] + filter_bam: run: ../tools/samtools-filter.cwl in: - bam_bai_pair: samtools_sort_index/bam_bai_pair + bam_bai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair exclude_chromosome: exclude_chromosome out: [filtered_bam_bai_pair] - clean_sam_headers_for_preseq: - run: ../tools/samtools-clean-headers.cwl - in: - bam_file: filter_bam/filtered_bam_bai_pair - out: [preseq_bam] - preseq: label: "Sequencing depth estimation" doc: | @@ -660,32 +678,22 @@ steps: be expected from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: clean_sam_headers_for_preseq/preseq_bam + bam_file: filter_bam/filtered_bam_bai_pair pe_mode: default: true extrapolation: default: 1000000000 - out: [estimates_file, log_file_stdout, log_file_stderr] + out: [estimates_file] - samtools_rmdup: - label: "PCR duplicates removal" - doc: | - Removes potential PCR duplicates. This step is used to remove reads overamplified - in PCR. Unfortunately, it may also remove "good" reads. 
We do not recommend to - remove duplicates unless the library is heavily duplicated. - run: ../tools/samtools-rmdup.cwl - in: - trigger: remove_duplicates - bam_file: filter_bam/filtered_bam_bai_pair - out: [rmdup_output, rmdup_log] - - samtools_sort_index_after_rmdup: - run: ../tools/samtools-sort-index.cwl + samtools_remove_duplicates: + run: ../tools/samtools-markdup.cwl in: - trigger: remove_duplicates - sort_input: samtools_rmdup/rmdup_output + bam_bai_pair: filter_bam/filtered_bam_bai_pair + keep_duplicates: + source: remove_duplicates + valueFrom: $(!self) threads: threads - out: [bam_bai_pair] + out: [deduplicated_bam_bai_pair] macs2_callpeak: label: "Peak detection" @@ -694,7 +702,7 @@ steps: transcription factor binding sites. run: ../tools/macs2-callpeak-biowardrobe-only.cwl in: - treatment_file: samtools_sort_index_after_rmdup/bam_bai_pair + treatment_file: samtools_remove_duplicates/deduplicated_bam_bai_pair control_file: control_file nolambda: source: control_file @@ -713,8 +721,7 @@ steps: valueFrom: $(!self) keep_dup: default: auto - q_value: - default: 0.05 + q_value: peak_calling_fdr format_mode: default: BAMPE buffer_size: @@ -735,9 +742,11 @@ steps: bam_to_bigwig: run: ../tools/bam-bedgraph-bigwig.cwl in: - bam_file: samtools_sort_index_after_rmdup/bam_bai_pair + bam_file: samtools_remove_duplicates/deduplicated_bam_bai_pair chrom_length_file: chrom_length - mapped_reads_number: get_stat/mapped_reads + mapped_reads_number: + source: [do_not_scale, get_stat/mapped_reads] + valueFrom: $(self[0]?null:self[1]) fragment_size: default: 9 out: [bigwig_file] @@ -749,20 +758,20 @@ steps: read length and quality score, etc. 
run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index/bam_bai_pair + bambai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index/bam_bai_pair + source: samtools_mark_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") out: [log_file] get_bam_statistics_after_filtering: run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index_after_rmdup/bam_bai_pair + bambai_pair: samtools_remove_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index_after_rmdup/bam_bai_pair + source: samtools_remove_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report_after_filtering.txt") - out: [log_file, ext_is_section] + out: [log_file, ext_is_section, reads_mapped] get_stat: run: ../tools/collect-statistics-chip-seq.cwl @@ -773,23 +782,12 @@ steps: bam_statistics_report: get_bam_statistics/log_file bam_statistics_after_filtering_report: get_bam_statistics_after_filtering/log_file macs2_called_peaks: macs2_callpeak/peak_xls_file + atdp_results: average_tag_density/result_file preseq_results: preseq/estimates_file paired_end: default: True out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] - preseq_plot_data: - label: "Formats sequencing depth estimation data for plotting" - doc: | - Formats estimates file from preseq standard output for QC plotting. This adds a new - column that includes the actual read count point on the plot. - run: ../tools/preseq-plot-data.cwl - in: - preseq_stderr_log_file: preseq/log_file_stderr - estimates_file: preseq/estimates_file - mapped_reads: get_stat/mapped_reads - out: [estimates_file_plot_data] - island_intersect: label: "Peak annotation" doc: | @@ -810,7 +808,7 @@ steps: elements are close to the TSS of their targets. 
run: ../tools/atdp.cwl in: - input_file: samtools_sort_index_after_rmdup/bam_bai_pair + input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: @@ -823,7 +821,9 @@ steps: default: "chrX chrY" avd_heat_window_bp: default: 200 - mapped_reads: get_stat/mapped_reads + mapped_reads: + source: get_bam_statistics_after_filtering/reads_mapped + valueFrom: $(parseInt(self/2)) out: [result_file, log_file] @@ -913,11 +913,8 @@ doc: | files to reference genome (Step bowtie_aligner). The output of this step is unsorted SAM file which is being sorted and indexed by samtools sort and samtools index (Step samtools_sort_index). Depending on workflow’s input parameters indexed and sorted BAM file - could be processed by samtools rmdup (Step samtools_rmdup) to remove all possible read duplicates. - In a case when removing duplicates is not necessary the step returns original input BAM and BAI - files without any processing. If the duplicates were removed the following step - (Step samtools_sort_index_after_rmdup) reruns samtools sort and samtools index with BAM and BAI files, - if not - the step returns original unchanged input files. Right after that macs2 callpeak performs + could be processed by `samtools markdup` *samtools\_remove\_duplicates* to remove all possible read duplicates. + Right after that macs2 callpeak performs peak calling (Step macs2_callpeak). On the base of returned outputs the next step (Step macs2_island_count) calculates the number of islands and estimated fragment size. 
If the last one is less that 80 (hardcoded in a workflow) macs2 callpeak is rerun again with forced fixed diff --git a/workflows/trim-atacseq-se.cwl b/workflows/trim-atacseq-se.cwl index 4c4d0901..ca85f219 100644 --- a/workflows/trim-atacseq-se.cwl +++ b/workflows/trim-atacseq-se.cwl @@ -67,7 +67,7 @@ inputs: - File - type: array items: File - label: "FASTQ input file" + label: "FASTQ input file(s)" format: "http://edamontology.org/format_1930" doc: "Reads data in a FASTQ format, received after single end sequencing" @@ -111,6 +111,16 @@ inputs: label: "Remove duplicates" doc: "Calls samtools rmdup to remove duplicates from sortesd BAM file" + peak_calling_fdr: + type: float? + default: 0.05 + 'sd:layout': + advanced: true + label: "Minimum FDR (q-value) cutoff for peak detection" + doc: | + Minimum FDR (q-value) cutoff for peak detection. -q, and + -p are mutually exclusive. + exclude_chromosome: type: string? default: "chrM" @@ -119,6 +129,14 @@ inputs: label: "Exclude chromosomes" doc: "Space separated list of chromosomes to be excluded" + do_not_scale: + type: boolean? + default: false + 'sd:layout': + advanced: true + label: "Do not scale genome coverage based on mapped reads number" + doc: "When converting BAM to bigWig, the scale will be set to 1 by default" + promoter_dist: type: int? 
default: 1000 @@ -248,19 +266,12 @@ outputs: data: [$1, $2] comparable: "atdp" - samtools_rmdup_log: - type: File - label: "Remove duplicates log" - format: "http://edamontology.org/format_2330" - doc: "Samtools rmdup generated log" - outputSource: samtools_rmdup/rmdup_log - bambai_pair: type: File format: "http://edamontology.org/format_2572" label: "Coordinate sorted BAM alignment file (+index BAI)" doc: "Coordinate sorted BAM file and BAI index file" - outputSource: samtools_sort_index_after_rmdup/bam_bai_pair + outputSource: samtools_remove_duplicates/deduplicated_bam_bai_pair 'sd:visualPlugins': - igvbrowser: tab: 'IGV Genome Browser' @@ -404,21 +415,22 @@ outputs: doc: "TrimGalore generated log" outputSource: trim_fastq/report_file - preseq_estimates_plot_data: + preseq_estimates: type: File? label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq_plot_data/estimates_file_plot_data + outputSource: preseq/estimates_file 'sd:visualPlugins': - - line: + - scatter: tab: 'QC Plots' - Title: 'Distinct Read Counts Estimates' - xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' - yAxisTitle: 'Distinct Reads Count' - colors: ["#4b78a3", "#a3514b"] + Title: 'Preseq Estimates' + xAxisTitle: 'Total reads count' + yAxisTitle: 'Expected distinct reads count' + colors: ["#4b78a3"] height: 500 - data: [$2, $5] + data: [$1, $2] + comparable: "preseq" estimated_fragment_size: type: int @@ -443,6 +455,8 @@ steps: run: ../tools/extract-fastq.cwl in: compressed_file: fastq_file + output_prefix: + default: "read_1" out: [fastq_file] trim_fastq: @@ -500,8 +514,11 @@ steps: bowtie_aligner: label: "Alignment to reference genome" doc: | - Aligns reads to the reference genome keeping only uniquely mapped reads with - less than 3 mismatches. + Aligns reads to the reference genome. + Reads are assumed to be mapped if they + have less than 3 mismatches. + sam_file output includes both mapped + and unmapped reads. 
run: ../tools/bowtie-alignreads.cwl in: upstream_filelist: rename/target_file @@ -536,19 +553,22 @@ steps: threads: threads out: [bam_bai_pair] + samtools_mark_duplicates: + run: ../tools/samtools-markdup.cwl + in: + bam_bai_pair: samtools_sort_index/bam_bai_pair + keep_duplicates: + default: true + threads: threads + out: [deduplicated_bam_bai_pair] + filter_bam: run: ../tools/samtools-filter.cwl in: - bam_bai_pair: samtools_sort_index/bam_bai_pair + bam_bai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair exclude_chromosome: exclude_chromosome out: [filtered_bam_bai_pair] - clean_sam_headers_for_preseq: - run: ../tools/samtools-clean-headers.cwl - in: - bam_file: filter_bam/filtered_bam_bai_pair - out: [preseq_bam] - preseq: label: "Sequencing depth estimation" doc: | @@ -556,32 +576,20 @@ steps: be expected from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: clean_sam_headers_for_preseq/preseq_bam + bam_file: filter_bam/filtered_bam_bai_pair extrapolation: default: 1000000000 - out: [estimates_file, log_file_stdout, log_file_stderr] - - samtools_rmdup: - label: "PCR duplicates removal" - doc: | - Removes potential PCR duplicates. This step is used to remove reads overamplified - in PCR. Unfortunately, it may also remove "good" reads. We do not recommend to - remove duplicates unless the library is heavily duplicated. 
- run: ../tools/samtools-rmdup.cwl - in: - trigger: remove_duplicates - bam_file: filter_bam/filtered_bam_bai_pair - single_end: - default: true - out: [rmdup_output, rmdup_log] + out: [estimates_file] - samtools_sort_index_after_rmdup: - run: ../tools/samtools-sort-index.cwl + samtools_remove_duplicates: + run: ../tools/samtools-markdup.cwl in: - trigger: remove_duplicates - sort_input: samtools_rmdup/rmdup_output + bam_bai_pair: filter_bam/filtered_bam_bai_pair + keep_duplicates: + source: remove_duplicates + valueFrom: $(!self) threads: threads - out: [bam_bai_pair] + out: [deduplicated_bam_bai_pair] macs2_callpeak: label: "Peak detection" @@ -590,7 +598,7 @@ steps: transcription factor binding sites. run: ../tools/macs2-callpeak-biowardrobe-only.cwl in: - treatment_file: samtools_sort_index_after_rmdup/bam_bai_pair + treatment_file: samtools_remove_duplicates/deduplicated_bam_bai_pair control_file: control_file nolambda: source: control_file @@ -609,8 +617,7 @@ steps: valueFrom: $(!self) keep_dup: default: auto - q_value: - default: 0.05 + q_value: peak_calling_fdr format_mode: default: BAM buffer_size: @@ -631,9 +638,11 @@ steps: bam_to_bigwig: run: ../tools/bam-bedgraph-bigwig.cwl in: - bam_file: samtools_sort_index_after_rmdup/bam_bai_pair + bam_file: samtools_remove_duplicates/deduplicated_bam_bai_pair chrom_length_file: chrom_length - mapped_reads_number: get_stat/mapped_reads + mapped_reads_number: + source: [do_not_scale, get_stat/mapped_reads] + valueFrom: $(self[0]?null:self[1]) fragment_size: default: 9 out: [bigwig_file] @@ -645,20 +654,20 @@ steps: read length and quality score, etc. 
run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index/bam_bai_pair + bambai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index/bam_bai_pair + source: samtools_mark_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") out: [log_file] get_bam_statistics_after_filtering: run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index_after_rmdup/bam_bai_pair + bambai_pair: samtools_remove_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index_after_rmdup/bam_bai_pair + source: samtools_remove_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report_after_filtering.txt") - out: [log_file] + out: [log_file, reads_mapped] get_stat: run: ../tools/collect-statistics-chip-seq.cwl @@ -668,21 +677,10 @@ steps: bam_statistics_report: get_bam_statistics/log_file bam_statistics_after_filtering_report: get_bam_statistics_after_filtering/log_file macs2_called_peaks: macs2_callpeak/peak_xls_file + atdp_results: average_tag_density/result_file preseq_results: preseq/estimates_file out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] - preseq_plot_data: - label: "Formats sequencing depth estimation data for plotting" - doc: | - Formats estimates file from preseq standard output for QC plotting. This adds a new - column that includes the actual read count point on the plot. - run: ../tools/preseq-plot-data.cwl - in: - preseq_stderr_log_file: preseq/log_file_stderr - estimates_file: preseq/estimates_file - mapped_reads: get_stat/mapped_reads - out: [estimates_file_plot_data] - island_intersect: label: "Peak annotation" doc: | @@ -703,7 +701,7 @@ steps: elements are close to the TSS of their targets. 
run: ../tools/atdp.cwl in: - input_file: samtools_sort_index_after_rmdup/bam_bai_pair + input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: @@ -716,7 +714,9 @@ steps: default: "chrX chrY" avd_heat_window_bp: default: 200 - mapped_reads: get_stat/mapped_reads + mapped_reads: + source: get_bam_statistics_after_filtering/reads_mapped + valueFrom: $(parseInt(self)) out: [result_file, log_file] @@ -805,9 +805,7 @@ doc: | *samtools\_sort\_index*. Based on workflow’s input parameters indexed and sorted BAM file - can be processed by `samtools rmdup` *samtools\_rmdup* to get rid of duplicated reads. - If removing duplicates is not required the original input BAM and BAI - files return. Otherwise step *samtools\_sort\_index\_after\_rmdup* repeat `samtools sort` and `samtools index` with BAM and BAI files. + can be processed by `samtools markdup` *samtools\_remove\_duplicates* to get rid of duplicated reads. Right after that `macs2 callpeak` performs peak calling *macs2\_callpeak*. On the base of returned outputs the next step *macs2\_island\_count* calculates the number of islands and estimated fragment size. 
If the last diff --git a/workflows/trim-chipseq-pe.cwl b/workflows/trim-chipseq-pe.cwl index 5d658f2a..e10519f8 100644 --- a/workflows/trim-chipseq-pe.cwl +++ b/workflows/trim-chipseq-pe.cwl @@ -68,7 +68,7 @@ inputs: - File - type: array items: File - label: "FASTQ 1 input file" + label: "FASTQ 1 input file(s)" format: "http://edamontology.org/format_1930" doc: "Reads data in a FASTQ format, received after paired end sequencing" @@ -77,7 +77,7 @@ inputs: - File - type: array items: File - label: "FASTQ 2 input file" + label: "FASTQ 2 input file(s)" format: "http://edamontology.org/format_1930" doc: "Reads data in a FASTQ format, received after paired end sequencing" @@ -121,6 +121,16 @@ inputs: label: "Remove duplicates" doc: "Calls samtools rmdup to remove duplicates from sortesd BAM file" + peak_calling_fdr: + type: float? + default: 0.05 + 'sd:layout': + advanced: true + label: "Minimum FDR (q-value) cutoff for peak detection" + doc: | + Minimum FDR (q-value) cutoff for peak detection. -q, and + -p are mutually exclusive. + promoter_dist: type: int? default: 1000 @@ -272,19 +282,12 @@ outputs: data: [$1, $2] comparable: "atdp" - samtools_rmdup_log: - type: File - label: "Remove duplicates log" - format: "http://edamontology.org/format_2330" - doc: "Samtools rmdup generated log" - outputSource: samtools_rmdup/rmdup_log - bambai_pair: type: File format: "http://edamontology.org/format_2572" label: "Coordinate sorted BAM alignment file (+index BAI)" doc: "Coordinate sorted BAM file and BAI index file" - outputSource: samtools_sort_index_after_rmdup/bam_bai_pair + outputSource: samtools_remove_duplicates/deduplicated_bam_bai_pair 'sd:visualPlugins': - igvbrowser: tab: 'IGV Genome Browser' @@ -451,21 +454,22 @@ outputs: doc: "TrimGalore generated log for FASTQ 2" outputSource: trim_fastq/report_file_pair - preseq_estimates_plot_data: + preseq_estimates: type: File? 
label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq_plot_data/estimates_file_plot_data + outputSource: preseq/estimates_file 'sd:visualPlugins': - - line: + - scatter: tab: 'QC Plots' - Title: 'Distinct Read Counts Estimates' - xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' - yAxisTitle: 'Distinct Reads Count' - colors: ["#4b78a3", "#a3514b"] + Title: 'Preseq Estimates' + xAxisTitle: 'Total reads count' + yAxisTitle: 'Expected distinct reads count' + colors: ["#4b78a3"] height: 500 - data: [$2, $5] + data: [$1, $2] + comparable: "preseq" estimated_fragment_size: type: int @@ -489,9 +493,9 @@ steps: the core facility by providing a URL or from GEO by providing SRA accession number. run: ../tools/extract-fastq.cwl in: - output_prefix: - default: "read_1" compressed_file: fastq_file_upstream + output_prefix: + default: "read_1" out: [fastq_file] extract_fastq_downstream: @@ -502,9 +506,9 @@ steps: the core facility by providing a URL or from GEO by providing SRA accession number. run: ../tools/extract-fastq.cwl in: - output_prefix: - default: "read_2" compressed_file: fastq_file_downstream + output_prefix: + default: "read_2" out: [fastq_file] trim_fastq: @@ -595,8 +599,11 @@ steps: bowtie_aligner: label: "Alignment to reference genome" doc: | - Aligns reads to the reference genome keeping only uniquely mapped reads with - less than 3 mismatches. + Aligns reads to the reference genome. + Reads are assumed to be mapped if they + have less than 3 mismatches. + sam_file output includes both mapped + and unmapped reads. 
run: ../tools/bowtie-alignreads.cwl in: upstream_filelist: rename_upstream/target_file @@ -632,11 +639,14 @@ steps: threads: threads out: [bam_bai_pair] - clean_sam_headers_for_preseq: - run: ../tools/samtools-clean-headers.cwl + samtools_mark_duplicates: + run: ../tools/samtools-markdup.cwl in: - bam_file: samtools_sort_index/bam_bai_pair - out: [preseq_bam] + bam_bai_pair: samtools_sort_index/bam_bai_pair + keep_duplicates: + default: true + threads: threads + out: [deduplicated_bam_bai_pair] preseq: label: "Sequencing depth estimation" @@ -645,32 +655,22 @@ steps: be expected from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: clean_sam_headers_for_preseq/preseq_bam + bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair pe_mode: default: true extrapolation: default: 1000000000 - out: [estimates_file, log_file_stdout, log_file_stderr] + out: [estimates_file] - samtools_rmdup: - label: "PCR duplicates removal" - doc: | - Removes potential PCR duplicates. This step is used to remove reads overamplified - in PCR. Unfortunately, it may also remove "good" reads. We do not recommend to - remove duplicates unless the library is heavily duplicated. - run: ../tools/samtools-rmdup.cwl + samtools_remove_duplicates: + run: ../tools/samtools-markdup.cwl in: - trigger: remove_duplicates - bam_file: samtools_sort_index/bam_bai_pair - out: [rmdup_output, rmdup_log] - - samtools_sort_index_after_rmdup: - run: ../tools/samtools-sort-index.cwl - in: - trigger: remove_duplicates - sort_input: samtools_rmdup/rmdup_output + bam_bai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair + keep_duplicates: + source: remove_duplicates + valueFrom: $(!self) threads: threads - out: [bam_bai_pair] + out: [deduplicated_bam_bai_pair] macs2_callpeak: label: "Peak detection" @@ -679,7 +679,7 @@ steps: transcription factor binding sites. 
run: ../tools/macs2-callpeak-biowardrobe-only.cwl in: - treatment_file: samtools_sort_index_after_rmdup/bam_bai_pair + treatment_file: samtools_remove_duplicates/deduplicated_bam_bai_pair control_file: control_file nolambda: source: control_file @@ -698,8 +698,7 @@ steps: valueFrom: $(!self) keep_dup: default: auto - q_value: - default: 0.05 + q_value: peak_calling_fdr format_mode: default: BAMPE buffer_size: @@ -720,7 +719,7 @@ steps: bam_to_bigwig: run: ../tools/bam-bedgraph-bigwig.cwl in: - bam_file: samtools_sort_index_after_rmdup/bam_bai_pair + bam_file: samtools_remove_duplicates/deduplicated_bam_bai_pair chrom_length_file: chrom_length mapped_reads_number: get_stat/mapped_reads pairchip: @@ -734,20 +733,20 @@ steps: read length and quality score, etc. run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index/bam_bai_pair + bambai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index/bam_bai_pair + source: samtools_mark_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") out: [log_file] get_bam_statistics_after_filtering: run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index_after_rmdup/bam_bai_pair + bambai_pair: samtools_remove_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index_after_rmdup/bam_bai_pair + source: samtools_remove_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report_after_filtering.txt") - out: [log_file, ext_is_section] + out: [log_file, ext_is_section, reads_mapped] get_stat: run: ../tools/collect-statistics-chip-seq.cwl @@ -758,23 +757,12 @@ steps: bam_statistics_report: get_bam_statistics/log_file bam_statistics_after_filtering_report: get_bam_statistics_after_filtering/log_file macs2_called_peaks: macs2_callpeak/peak_xls_file + atdp_results: average_tag_density/result_file preseq_results: preseq/estimates_file paired_end: default: 
True out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] - preseq_plot_data: - label: "Formats sequencing depth estimation data for plotting" - doc: | - Formats estimates file from preseq standard output for QC plotting. This adds a new - column that includes the actual read count point on the plot. - run: ../tools/preseq-plot-data.cwl - in: - preseq_stderr_log_file: preseq/log_file_stderr - estimates_file: preseq/estimates_file - mapped_reads: get_stat/mapped_reads - out: [estimates_file_plot_data] - island_intersect: label: "Peak annotation" doc: | @@ -795,7 +783,7 @@ steps: elements are close to the TSS of their targets. run: ../tools/atdp.cwl in: - input_file: samtools_sort_index_after_rmdup/bam_bai_pair + input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: @@ -808,7 +796,9 @@ steps: default: "chrX chrY" avd_heat_window_bp: default: 200 - mapped_reads: get_stat/mapped_reads + mapped_reads: + source: get_bam_statistics_after_filtering/reads_mapped + valueFrom: $(parseInt(self/2)) out: [result_file, log_file] @@ -894,11 +884,8 @@ doc: | files to reference genome (Step bowtie_aligner). The output of this step is unsorted SAM file which is being sorted and indexed by samtools sort and samtools index (Step samtools_sort_index). Depending on workflow’s input parameters indexed and sorted BAM file - could be processed by samtools rmdup (Step samtools_rmdup) to remove all possible read duplicates. - In a case when removing duplicates is not necessary the step returns original input BAM and BAI - files without any processing. If the duplicates were removed the following step - (Step samtools_sort_index_after_rmdup) reruns samtools sort and samtools index with BAM and BAI files, - if not - the step returns original unchanged input files. 
Right after that macs2 callpeak performs + could be processed by `samtools markdup` *samtools\_remove\_duplicates* to remove all possible read duplicates. + Right after that macs2 callpeak performs peak calling (Step macs2_callpeak). On the base of returned outputs the next step (Step macs2_island_count) calculates the number of islands and estimated fragment size. If the last one is less that 80 (hardcoded in a workflow) macs2 callpeak is rerun again with forced fixed diff --git a/workflows/trim-chipseq-se.cwl b/workflows/trim-chipseq-se.cwl index f12ff0a6..687ac248 100644 --- a/workflows/trim-chipseq-se.cwl +++ b/workflows/trim-chipseq-se.cwl @@ -68,7 +68,7 @@ inputs: - File - type: array items: File - label: "FASTQ input file" + label: "FASTQ input file(s)" format: "http://edamontology.org/format_1930" doc: "Reads data in a FASTQ format, received after single end sequencing" @@ -112,6 +112,16 @@ inputs: label: "Remove duplicates" doc: "Calls samtools rmdup to remove duplicates from sortesd BAM file" + peak_calling_fdr: + type: float? + default: 0.05 + 'sd:layout': + advanced: true + label: "Minimum FDR (q-value) cutoff for peak detection" + doc: | + Minimum FDR (q-value) cutoff for peak detection. -q, and + -p are mutually exclusive. + promoter_dist: type: int? 
default: 1000 @@ -241,19 +251,12 @@ outputs: data: [$1, $2] comparable: "atdp" - samtools_rmdup_log: - type: File - label: "Remove duplicates log" - format: "http://edamontology.org/format_2330" - doc: "Samtools rmdup generated log" - outputSource: samtools_rmdup/rmdup_log - bambai_pair: type: File format: "http://edamontology.org/format_2572" label: "Coordinate sorted BAM alignment file (+index BAI)" doc: "Coordinate sorted BAM file and BAI index file" - outputSource: samtools_sort_index_after_rmdup/bam_bai_pair + outputSource: samtools_remove_duplicates/deduplicated_bam_bai_pair 'sd:visualPlugins': - igvbrowser: tab: 'IGV Genome Browser' @@ -397,21 +400,22 @@ outputs: doc: "TrimGalore generated log" outputSource: trim_fastq/report_file - preseq_estimates_plot_data: + preseq_estimates: type: File? label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq_plot_data/estimates_file_plot_data + outputSource: preseq/estimates_file 'sd:visualPlugins': - - line: + - scatter: tab: 'QC Plots' - Title: 'Distinct Read Counts Estimates' - xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' - yAxisTitle: 'Distinct Reads Count' - colors: ["#4b78a3", "#a3514b"] + Title: 'Preseq Estimates' + xAxisTitle: 'Total reads count' + yAxisTitle: 'Expected distinct reads count' + colors: ["#4b78a3"] height: 500 - data: [$2, $5] + data: [$1, $2] + comparable: "preseq" estimated_fragment_size: type: int @@ -436,6 +440,8 @@ steps: run: ../tools/extract-fastq.cwl in: compressed_file: fastq_file + output_prefix: + default: "read_1" out: [fastq_file] trim_fastq: @@ -493,8 +499,11 @@ steps: bowtie_aligner: label: "Alignment to reference genome" doc: | - Aligns reads to the reference genome keeping only uniquely mapped reads with - less than 3 mismatches. + Aligns reads to the reference genome. + Reads are assumed to be mapped if they + have less than 3 mismatches. + sam_file output includes both mapped + and unmapped reads. 
run: ../tools/bowtie-alignreads.cwl in: upstream_filelist: rename/target_file @@ -529,11 +538,14 @@ steps: threads: threads out: [bam_bai_pair] - clean_sam_headers_for_preseq: - run: ../tools/samtools-clean-headers.cwl + samtools_mark_duplicates: + run: ../tools/samtools-markdup.cwl in: - bam_file: samtools_sort_index/bam_bai_pair - out: [preseq_bam] + bam_bai_pair: samtools_sort_index/bam_bai_pair + keep_duplicates: + default: true + threads: threads + out: [deduplicated_bam_bai_pair] preseq: label: "Sequencing depth estimation" @@ -542,32 +554,20 @@ steps: be expected from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: clean_sam_headers_for_preseq/preseq_bam + bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair extrapolation: default: 1000000000 - out: [estimates_file, log_file_stdout, log_file_stderr] + out: [estimates_file] - samtools_rmdup: - label: "PCR duplicates removal" - doc: | - Removes potential PCR duplicates. This step is used to remove reads overamplified - in PCR. Unfortunately, it may also remove "good" reads. We do not recommend to - remove duplicates unless the library is heavily duplicated. - run: ../tools/samtools-rmdup.cwl - in: - trigger: remove_duplicates - bam_file: samtools_sort_index/bam_bai_pair - single_end: - default: true - out: [rmdup_output, rmdup_log] - - samtools_sort_index_after_rmdup: - run: ../tools/samtools-sort-index.cwl + samtools_remove_duplicates: + run: ../tools/samtools-markdup.cwl in: - trigger: remove_duplicates - sort_input: samtools_rmdup/rmdup_output + bam_bai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair + keep_duplicates: + source: remove_duplicates + valueFrom: $(!self) threads: threads - out: [bam_bai_pair] + out: [deduplicated_bam_bai_pair] macs2_callpeak: label: "Peak detection" @@ -576,7 +576,7 @@ steps: transcription factor binding sites. 
run: ../tools/macs2-callpeak-biowardrobe-only.cwl in: - treatment_file: samtools_sort_index_after_rmdup/bam_bai_pair + treatment_file: samtools_remove_duplicates/deduplicated_bam_bai_pair control_file: control_file nolambda: source: control_file @@ -595,8 +595,7 @@ steps: valueFrom: $(!self) keep_dup: default: auto - q_value: - default: 0.05 + q_value: peak_calling_fdr format_mode: default: BAM buffer_size: @@ -617,7 +616,7 @@ steps: bam_to_bigwig: run: ../tools/bam-bedgraph-bigwig.cwl in: - bam_file: samtools_sort_index_after_rmdup/bam_bai_pair + bam_file: samtools_remove_duplicates/deduplicated_bam_bai_pair chrom_length_file: chrom_length mapped_reads_number: get_stat/mapped_reads fragment_size: macs2_callpeak/macs2_fragments_calculated @@ -630,20 +629,20 @@ steps: read length and quality score, etc. run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index/bam_bai_pair + bambai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index/bam_bai_pair + source: samtools_mark_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") out: [log_file] get_bam_statistics_after_filtering: run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index_after_rmdup/bam_bai_pair + bambai_pair: samtools_remove_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index_after_rmdup/bam_bai_pair + source: samtools_remove_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report_after_filtering.txt") - out: [log_file] + out: [log_file, reads_mapped] get_stat: run: ../tools/collect-statistics-chip-seq.cwl @@ -653,21 +652,10 @@ steps: bam_statistics_report: get_bam_statistics/log_file bam_statistics_after_filtering_report: get_bam_statistics_after_filtering/log_file macs2_called_peaks: macs2_callpeak/peak_xls_file + atdp_results: average_tag_density/result_file preseq_results: preseq/estimates_file out: 
[collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] - preseq_plot_data: - label: "Formats sequencing depth estimation data for plotting" - doc: | - Formats estimates file from preseq standard output for QC plotting. This adds a new - column that includes the actual read count point on the plot. - run: ../tools/preseq-plot-data.cwl - in: - preseq_stderr_log_file: preseq/log_file_stderr - estimates_file: preseq/estimates_file - mapped_reads: get_stat/mapped_reads - out: [estimates_file_plot_data] - island_intersect: label: "Peak annotation" doc: | @@ -688,7 +676,7 @@ steps: elements are close to the TSS of their targets. run: ../tools/atdp.cwl in: - input_file: samtools_sort_index_after_rmdup/bam_bai_pair + input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: @@ -701,7 +689,9 @@ steps: default: "chrX chrY" avd_heat_window_bp: default: 200 - mapped_reads: get_stat/mapped_reads + mapped_reads: + source: get_bam_statistics_after_filtering/reads_mapped + valueFrom: $(parseInt(self)) out: [result_file, log_file] @@ -789,9 +779,7 @@ doc: | *samtools\_sort\_index*. Based on workflow’s input parameters indexed and sorted BAM file - can be processed by `samtools rmdup` *samtools\_rmdup* to get rid of duplicated reads. - If removing duplicates is not required the original input BAM and BAI - files return. Otherwise step *samtools\_sort\_index\_after\_rmdup* repeat `samtools sort` and `samtools index` with BAM and BAI files. + can be processed by `samtools markdup` *samtools\_remove\_duplicates* to get rid of duplicated reads. Right after that `macs2 callpeak` performs peak calling *macs2\_callpeak*. On the base of returned outputs the next step *macs2\_island\_count* calculates the number of islands and estimated fragment size. 
If the last From 4c9fc6997e9da8deb1709e1fdd1ae768549ff2ec Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 14 Sep 2023 15:53:34 -0400 Subject: [PATCH 074/162] ATAC and ChIP workflows update --- tools/collect-statistics-chip-seq.cwl | 53 ++++++-- tools/samtools-filter.cwl | 40 +++--- tools/samtools-markdup.cwl | 184 ++++++++++++++++++++++++++ workflows/chipseq-pe.cwl | 157 +++++++++++----------- workflows/chipseq-se.cwl | 150 ++++++++++----------- workflows/trim-atacseq-pe.cwl | 155 +++++++++++----------- workflows/trim-atacseq-se.cwl | 146 ++++++++++---------- workflows/trim-chipseq-pe.cwl | 137 +++++++++---------- workflows/trim-chipseq-se.cwl | 128 ++++++++---------- 9 files changed, 666 insertions(+), 484 deletions(-) create mode 100644 tools/samtools-markdup.cwl diff --git a/tools/collect-statistics-chip-seq.cwl b/tools/collect-statistics-chip-seq.cwl index 21890927..1074d859 100644 --- a/tools/collect-statistics-chip-seq.cwl +++ b/tools/collect-statistics-chip-seq.cwl @@ -18,7 +18,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: rackspacedot/python37 + dockerPull: biowardrobe2/sc-tools:v0.0.29 inputs: @@ -26,19 +26,20 @@ inputs: script: type: string? 
default: | - #!/usr/bin/env python + #!/usr/bin/env python3 import os import sys import argparse + import pandas import yaml import math def cut_int(s): - return int(s.strip().split()[0]) + return int(str(s).strip().split()[0]) def cut_float(s): - return float(s.strip().split()[0]) + return float(str(s).strip().split()[0]) TRIMGALORE = { @@ -149,6 +150,11 @@ inputs: "function": int, "pair_end_specific": True }, + "reads duplicated": { + "alias": "reads/pairs duplicated", + "function": cut_int, + "pair_end_specific": True + }, "average length": { "alias": "reads average length", "function": float, @@ -177,6 +183,7 @@ inputs: "order": ["total reads/pairs", "reads/pairs mapped", "reads/pairs unmapped", + "reads/pairs duplicated", "reads average length", "reads maximum length", "reads average quality", @@ -228,6 +235,7 @@ inputs: general_parser.add_argument("--bamstats", help="Path to bam statistics report file", required=True) general_parser.add_argument("--bamstatsfilter", help="Path to bam statistics report file after filtering", required=True) general_parser.add_argument("--macs2", help="Path to MACS2 called peaks xls file", required=True) + general_parser.add_argument("--atdp", help="Path to ATDP output TSV file", required=True) general_parser.add_argument("--preseq", help="Path to Preseq output file", required=False) general_parser.add_argument("--paired", help="Process as paired-end. 
Default: False", action="store_true") general_parser.add_argument("--output", help="Output filename prefix", required=True) @@ -297,7 +305,7 @@ inputs: res_key, res_function, pair_end_specific = get_correspondent_key(key_dict, key) if not collected_results[header].get(res_key, None): if pair_end_specific and pair_end: - collected_results[header][res_key] = res_function(res_function(value)/2) + collected_results[header][res_key] = res_function(int(res_function(value)/2)) else: collected_results[header][res_key] = res_function(value) except Exception: @@ -331,6 +339,12 @@ inputs: collected_results[header] = {k: collected_results[header][k] for k in MACS2["order"] if k in collected_results[header]} + def process_atdp_results(filepath, collected_results, header): + if not collected_results.get(header, None): + collected_results[header] = {} + collected_results[header]["maximum"] = str(pandas.read_csv(filepath, sep="\t")["Y"].max()) + + def process_preseq_results(filepath, collected_results, header, threashold=0.001): px, py = 0, 0 for line in open_file(filepath): @@ -357,6 +371,7 @@ inputs: process_custom_report(args.bamstatsfilter, collected_results, "BAM statistics after filtering", BAMSTATS, bool(args.paired)) process_custom_report(args.macs2, collected_results, "peak calling statistics", MACS2) process_macs2_xls(args.macs2, collected_results, "peak calling statistics") + process_atdp_results(args.atdp, collected_results, "average tag density") if args.preseq: process_preseq_results(args.preseq, collected_results, "library preparation") return (collected_results) @@ -406,6 +421,7 @@ inputs: "total reads/pairs", "reads/pairs mapped", "reads/pairs unmapped", + "reads/pairs duplicated", "insert size average", "insert size standard deviation", "reads average length", @@ -416,6 +432,7 @@ inputs: "total reads/pairs", "reads/pairs mapped", "reads/pairs unmapped", + "reads/pairs duplicated", "insert size average", "insert size standard deviation", "reads average length", @@ 
-428,7 +445,10 @@ inputs: "total reads/pairs in treatment", "reads/pairs after filtering in treatment", "redundant rate in treatment", - "fraction of reads in peaks"] + "fraction of reads in peaks", + + "average tag density", + "maximum"] if collected_data.get("adapter trimming statistics", None): header.extend(["adapter trimming statistics", @@ -464,6 +484,7 @@ inputs: collected_data["BAM statistics"]["total reads/pairs"], collected_data["BAM statistics"]["reads/pairs mapped"], collected_data["BAM statistics"]["reads/pairs unmapped"], + collected_data["BAM statistics"]["reads/pairs duplicated"], collected_data["BAM statistics"]["insert size average"], collected_data["BAM statistics"]["insert size standard deviation"], collected_data["BAM statistics"]["reads average length"], @@ -474,6 +495,7 @@ inputs: collected_data["BAM statistics after filtering"]["total reads/pairs"], collected_data["BAM statistics after filtering"]["reads/pairs mapped"], collected_data["BAM statistics after filtering"]["reads/pairs unmapped"], + collected_data["BAM statistics after filtering"]["reads/pairs duplicated"], collected_data["BAM statistics after filtering"]["insert size average"], collected_data["BAM statistics after filtering"]["insert size standard deviation"], collected_data["BAM statistics after filtering"]["reads average length"], @@ -486,7 +508,10 @@ inputs: collected_data["peak calling statistics"]["total reads/pairs in treatment"], collected_data["peak calling statistics"]["reads/pairs after filtering in treatment"], collected_data["peak calling statistics"]["redundant rate in treatment"], - collected_data["peak calling statistics"]["fraction of reads in peaks"]] + collected_data["peak calling statistics"]["fraction of reads in peaks"], + + "", + collected_data["average tag density"]["maximum"]] if collected_data.get("adapter trimming statistics", None): data.extend(["", @@ -559,22 +584,28 @@ inputs: position: 11 prefix: "--macs2" + atdp_results: + type: File + 
inputBinding: + position: 12 + prefix: "--atdp" + preseq_results: type: File? inputBinding: - position: 12 + position: 13 prefix: "--preseq" paired_end: type: boolean? inputBinding: - position: 13 + position: 14 prefix: "--paired" output_prefix: type: string? inputBinding: - position: 14 + position: 15 prefix: "--output" valueFrom: $(get_output_prefix()) default: "" @@ -605,7 +636,7 @@ outputs: outputEval: $(parseInt(self[0].contents.split('\n')[1].split('\t')[1])) -baseCommand: [python, '-c'] +baseCommand: [python3, '-c'] $namespaces: diff --git a/tools/samtools-filter.cwl b/tools/samtools-filter.cwl index 531a4fe4..f1d3012c 100644 --- a/tools/samtools-filter.cwl +++ b/tools/samtools-filter.cwl @@ -3,11 +3,8 @@ class: CommandLineTool requirements: - class: InlineJavascriptRequirement - - -hints: - class: DockerRequirement - dockerPull: biowardrobe2/samtools:v1.4 + dockerPull: biowardrobe2/samtools:v1.11 inputs: @@ -16,15 +13,20 @@ inputs: type: string? default: | #!/bin/bash + echo "Copy $0 to temp.bam" + cp $0 temp.bam + samtools sort temp.bam -o temp_sorted.bam + samtools index temp_sorted.bam echo "Filtering BAM file" - echo "samtools idxstats $0 | cut -f 1 | grep -v -E \"`echo $1 | sed -e 's/ /$|/g'`$|\*\" | xargs samtools view -q $2 -o $3 $0" - samtools idxstats $0 | cut -f 1 | grep -v -E "`echo $1 | sed -e 's/ /$|/g'`$|\*" | xargs samtools view -q $2 -o $3 $0 + echo "samtools idxstats temp_sorted.bam | cut -f 1 | grep -v -E \"`echo $1 | sed -e 's/ /$|/g'`$|\*\" | xargs samtools view -q $2 -F $3 -o temp_filtered.bam temp_sorted.bam" + samtools idxstats temp_sorted.bam | cut -f 1 | grep -v -E "`echo $1 | sed -e 's/ /$|/g'`$|\*" | xargs samtools view -q $2 -F $3 -o temp_filtered.bam temp_sorted.bam echo "Sorting BAM file" - echo "samtools sort $3 -o $3" - samtools sort $3 -o $3 + echo "samtools sort temp_filtered.bam -o $4" + samtools sort temp_filtered.bam -o $4 echo "Indexing BAM file" - echo "samtools index $3" - samtools index $3 + echo "samtools index $4" 
+ samtools index $4 + rm -f temp* inputBinding: position: 5 doc: "Script to exclude chromosomes from the BAM file and filter reads by quality" @@ -49,11 +51,18 @@ inputs: position: 8 default: 0 doc: "Skip alignments with MAPQ smaller than INT. Default 0" - + + negative_flag: + type: int? + inputBinding: + position: 9 + default: 0 + doc: "Do not output alignments with any bits set in INT present in the FLAG field. Default 0" + output_filename: type: string? inputBinding: - position: 9 + position: 10 valueFrom: | ${ return (self == "")?inputs.bam_bai_pair.basename:self; @@ -121,8 +130,9 @@ s:creator: - id: http://orcid.org/0000-0002-6486-3898 doc: | - Excludes chromosomes from the input BAM file. Filters reads by quality. - If there is only one chromosome present, you cannot exclude it + Excludes chromosomes from the input BAM file. + Optionally filters reads by quality and flags s:about: | - Excludes chromosomes from the input BAM file \ No newline at end of file + Excludes chromosomes from the input BAM file. + Optionally filters reads by quality and flags \ No newline at end of file diff --git a/tools/samtools-markdup.cwl b/tools/samtools-markdup.cwl new file mode 100644 index 00000000..b777f859 --- /dev/null +++ b/tools/samtools-markdup.cwl @@ -0,0 +1,184 @@ +cwlVersion: v1.0 +class: CommandLineTool + +requirements: +- class: InlineJavascriptRequirement +- class: InitialWorkDirRequirement + listing: | + ${ + return [ + { + "entry": inputs.bam_bai_pair, + "entryname": inputs.bam_bai_pair.basename, + "writable": true + } + ] + } +- class: DockerRequirement + dockerPull: biowardrobe2/samtools:v1.11 + + +inputs: + + script: + type: string? 
+ default: | + + #!/bin/bash + echo "Rename $0 to temp.bam" + mv $0 temp.bam + if [ -f $0.bai ]; then + echo "Rename $0.bai to temp.bam.bai" + mv $0.bai temp.bam.bai + fi + + echo "Sorting BAM file by name" + echo "samtools sort -n -@ $3 -o namesorted.bam temp.bam" + samtools sort -n -@ $3 -o namesorted.bam temp.bam + + echo "Filling in mate coordinates and inserting size fields" + echo "samtools fixmate -m -@ $3 namesorted.bam fixed.bam" + samtools fixmate -m -@ $3 namesorted.bam fixed.bam + + echo "Sorting BAM file by coordinates" + echo "samtools sort -@ $3 -o positionsorted.bam fixed.bam" + samtools sort -@ $3 -o positionsorted.bam fixed.bam + + if [ "$1" = "true" ] + then + echo "Only marking PCR duplicates" + echo "samtools markdup -c -s -@ $3 positionsorted.bam markduped.bam" + samtools markdup -c -s -@ $3 positionsorted.bam markduped.bam 2> markdup_report.tsv + else + echo "Removing PCR duplicates" + echo "samtools markdup -c -r -s -@ $3 positionsorted.bam markduped.bam" + samtools markdup -c -r -s -@ $3 positionsorted.bam markduped.bam 2> markdup_report.tsv + fi + + echo "Sorting BAM file" + echo "samtools sort -@ $3 markduped.bam -o $2" + samtools sort -@ $3 markduped.bam -o $2 + + echo "Indexing BAM file" + echo "samtools index $2" + samtools index $2 + + echo "Removing temporary files" + rm -f namesorted.bam fixed.bam positionsorted.bam markduped.bam temp.bam* + + inputBinding: + position: 5 + doc: "Script to remove PCR duplicates" + + bam_bai_pair: + type: File + inputBinding: + position: 6 + doc: BAM (optionally BAI) files + + keep_duplicates: + type: boolean? + default: false # somehow when omitted, valueFrom is not evaluated + inputBinding: + position: 7 + valueFrom: $(self?"true":"false") + doc: | + If true duplicates will be only + marked, oterwise - removed + + output_filename: + type: string? 
+ inputBinding: + position: 8 + valueFrom: | + ${ + return (self == "")?inputs.bam_bai_pair.basename:self; + } + default: "" + doc: "Output filename for the filtered BAM file" + + threads: + type: int? + inputBinding: + position: 9 + default: 1 + doc: "Number of threads to use" + + +outputs: + + deduplicated_bam_bai_pair: + type: File + outputBinding: + glob: "*.bam" + secondaryFiles: + - .bai + doc: "BAM+BAI files with PCR duplicates removed" + + markdup_report: + type: File + outputBinding: + glob: "markdup_report.tsv" + doc: "Markdup report" + + +baseCommand: [bash, '-c'] + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +s:mainEntity: + $import: ./metadata/samtools-metadata.yaml + +s:name: "samtools-markdup" +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/samtools-markdup.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + +doc: | + Removes or only marks PCR duplicates from + coordinate sorted and indexed BAM 
file. + Returns coordinate sorted and indexed BAM + files. Stages input bam_bai_pair to workdir. + Otherwise samtools sort fails. + +s:about: | + Removes or only marks PCR duplicates from + coordinate sorted and indexed BAM file. + Returns coordinate sorted and indexed BAM + files. Stages input bam_bai_pair to workdir. + Otherwise samtools sort fails. \ No newline at end of file diff --git a/workflows/chipseq-pe.cwl b/workflows/chipseq-pe.cwl index 86d01f12..1c5c2e65 100644 --- a/workflows/chipseq-pe.cwl +++ b/workflows/chipseq-pe.cwl @@ -64,14 +64,20 @@ inputs: doc: "Set to call broad peak for MACS2" fastq_file_upstream: - type: File - label: "FASTQ 1 input file" + type: + - File + - type: array + items: File + label: "FASTQ 1 input file(s)" format: "http://edamontology.org/format_1930" doc: "Reads data in a FASTQ format, received after paired end sequencing" fastq_file_downstream: - type: File - label: "FASTQ 2 input file" + type: + - File + - type: array + items: File + label: "FASTQ 2 input file(s)" format: "http://edamontology.org/format_1930" doc: "Reads data in a FASTQ format, received after paired end sequencing" @@ -115,6 +121,16 @@ inputs: label: "Remove duplicates" doc: "Calls samtools rmdup to remove duplicates from sortesd BAM file" + peak_calling_fdr: + type: float? + default: 0.05 + 'sd:layout': + advanced: true + label: "Minimum FDR (q-value) cutoff for peak detection" + doc: | + Minimum FDR (q-value) cutoff for peak detection. -q, and + -p are mutually exclusive. + promoter_dist: type: int? 
default: 1000 @@ -266,19 +282,12 @@ outputs: data: [$1, $2] comparable: "atdp" - samtools_rmdup_log: - type: File - label: "Remove duplicates log" - format: "http://edamontology.org/format_2330" - doc: "Samtools rmdup generated log" - outputSource: samtools_rmdup/rmdup_log - bambai_pair: type: File format: "http://edamontology.org/format_2572" label: "Coordinate sorted BAM alignment file (+index BAI)" doc: "Coordinate sorted BAM file and BAI index file" - outputSource: samtools_sort_index_after_rmdup/bam_bai_pair + outputSource: samtools_remove_duplicates/deduplicated_bam_bai_pair 'sd:visualPlugins': - igvbrowser: tab: 'IGV Genome Browser' @@ -308,8 +317,7 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Narrow peaks" - displayMode: "COLLAPSE" - height: 40 + height: 120 macs2_broad_peaks: type: File? @@ -323,8 +331,7 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Broad peaks" - displayMode: "COLLAPSE" - height: 40 + height: 120 macs2_peak_summits: type: File? @@ -352,8 +359,7 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Gapped peaks" - displayMode: "COLLAPSE" - height: 40 + height: 120 macs2_log: type: File? @@ -433,21 +439,22 @@ outputs: doc: "fragment, calculated fragment, islands count from MACS2 results" outputSource: macs2_callpeak/macs2_stat_file - preseq_estimates_plot_data: + preseq_estimates: type: File? 
label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq_plot_data/estimates_file_plot_data + outputSource: preseq/estimates_file 'sd:visualPlugins': - - line: + - scatter: tab: 'QC Plots' - Title: 'Distinct Read Counts Estimates' - xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' - yAxisTitle: 'Distinct Reads Count' - colors: ["#4b78a3", "#a3514b"] + Title: 'Preseq Estimates' + xAxisTitle: 'Total reads count' + yAxisTitle: 'Expected distinct reads count' + colors: ["#4b78a3"] height: 500 - data: [$2, $5] + data: [$1, $2] + comparable: "preseq" estimated_fragment_size: type: int @@ -472,9 +479,9 @@ steps: the core facility by providing a URL or from GEO by providing SRA accession number. run: ../tools/extract-fastq.cwl in: - output_prefix: - default: "read_1" compressed_file: fastq_file_upstream + output_prefix: + default: "read_1" out: [fastq_file] extract_fastq_downstream: @@ -485,9 +492,9 @@ steps: the core facility by providing a URL or from GEO by providing SRA accession number. run: ../tools/extract-fastq.cwl in: - output_prefix: - default: "read_2" compressed_file: fastq_file_downstream + output_prefix: + default: "read_2" out: [fastq_file] fastx_quality_stats_upstream: @@ -515,8 +522,11 @@ steps: bowtie_aligner: label: "Alignment to reference genome" doc: | - Aligns reads to the reference genome keeping only uniquely mapped reads with - less than 3 mismatches. + Aligns reads to the reference genome. + Reads are assumed to be mapped if they + have less than 3 mismatches. + sam_file output includes both mapped + and unmapped reads. 
run: ../tools/bowtie-alignreads.cwl in: upstream_filelist: extract_fastq_upstream/fastq_file @@ -552,11 +562,14 @@ steps: threads: threads out: [bam_bai_pair] - clean_sam_headers_for_preseq: - run: ../tools/samtools-clean-headers.cwl + samtools_mark_duplicates: + run: ../tools/samtools-markdup.cwl in: - bam_file: samtools_sort_index/bam_bai_pair - out: [preseq_bam] + bam_bai_pair: samtools_sort_index/bam_bai_pair + keep_duplicates: + default: true + threads: threads + out: [deduplicated_bam_bai_pair] preseq: label: "Sequencing depth estimation" @@ -565,32 +578,22 @@ steps: be expected from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: clean_sam_headers_for_preseq/preseq_bam + bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair pe_mode: default: true extrapolation: default: 1000000000 - out: [estimates_file, log_file_stdout, log_file_stderr] - - samtools_rmdup: - label: "PCR duplicates removal" - doc: | - Removes potential PCR duplicates. This step is used to remove reads overamplified - in PCR. Unfortunately, it may also remove "good" reads. We do not recommend to - remove duplicates unless the library is heavily duplicated. - run: ../tools/samtools-rmdup.cwl - in: - trigger: remove_duplicates - bam_file: samtools_sort_index/bam_bai_pair - out: [rmdup_output, rmdup_log] + out: [estimates_file] - samtools_sort_index_after_rmdup: - run: ../tools/samtools-sort-index.cwl + samtools_remove_duplicates: + run: ../tools/samtools-markdup.cwl in: - trigger: remove_duplicates - sort_input: samtools_rmdup/rmdup_output + bam_bai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair + keep_duplicates: + source: remove_duplicates + valueFrom: $(!self) threads: threads - out: [bam_bai_pair] + out: [deduplicated_bam_bai_pair] macs2_callpeak: label: "Peak detection" @@ -599,7 +602,7 @@ steps: transcription factor binding sites. 
run: ../tools/macs2-callpeak-biowardrobe-only.cwl in: - treatment_file: samtools_sort_index_after_rmdup/bam_bai_pair + treatment_file: samtools_remove_duplicates/deduplicated_bam_bai_pair control_file: control_file nolambda: source: control_file @@ -618,8 +621,7 @@ steps: valueFrom: $(!self) keep_dup: default: auto - q_value: - default: 0.05 + q_value: peak_calling_fdr format_mode: default: BAMPE buffer_size: @@ -640,7 +642,7 @@ steps: bam_to_bigwig: run: ../tools/bam-bedgraph-bigwig.cwl in: - bam_file: samtools_sort_index_after_rmdup/bam_bai_pair + bam_file: samtools_remove_duplicates/deduplicated_bam_bai_pair chrom_length_file: chrom_length mapped_reads_number: get_stat/mapped_reads pairchip: @@ -654,20 +656,20 @@ steps: read length and quality score, etc. run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index/bam_bai_pair + bambai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index/bam_bai_pair + source: samtools_mark_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") out: [log_file] get_bam_statistics_after_filtering: run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index_after_rmdup/bam_bai_pair + bambai_pair: samtools_remove_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index_after_rmdup/bam_bai_pair + source: samtools_remove_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report_after_filtering.txt") - out: [log_file, ext_is_section] + out: [log_file, ext_is_section, reads_mapped] get_stat: run: ../tools/collect-statistics-chip-seq.cwl @@ -676,23 +678,12 @@ steps: bam_statistics_report: get_bam_statistics/log_file bam_statistics_after_filtering_report: get_bam_statistics_after_filtering/log_file macs2_called_peaks: macs2_callpeak/peak_xls_file + atdp_results: average_tag_density/result_file preseq_results: preseq/estimates_file paired_end: default: 
True out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] - preseq_plot_data: - label: "Formats sequencing depth estimation data for plotting" - doc: | - Formats estimates file from preseq standard output for QC plotting. This adds a new - column that includes the actual read count point on the plot. - run: ../tools/preseq-plot-data.cwl - in: - preseq_stderr_log_file: preseq/log_file_stderr - estimates_file: preseq/estimates_file - mapped_reads: get_stat/mapped_reads - out: [estimates_file_plot_data] - island_intersect: label: "Peak annotation" doc: | @@ -713,7 +704,7 @@ steps: elements are close to the TSS of their targets. run: ../tools/atdp.cwl in: - input_file: samtools_sort_index_after_rmdup/bam_bai_pair + input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: @@ -726,7 +717,9 @@ steps: default: "chrX chrY" avd_heat_window_bp: default: 200 - mapped_reads: get_stat/mapped_reads + mapped_reads: + source: get_bam_statistics_after_filtering/reads_mapped + valueFrom: $(parseInt(self/2)) out: [result_file, log_file] @@ -736,8 +729,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Deprecated. ChIP-Seq pipeline paired-end" -s:name: "Deprecated. ChIP-Seq pipeline paired-end" +label: "ChIP-Seq pipeline paired-end" +s:name: "ChIP-Seq pipeline paired-end" s:alternateName: "ChIP-Seq basic analysis workflow for a paired-end experiment" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/chipseq-pe.cwl @@ -797,9 +790,7 @@ doc: | *samtools\_sort\_index*. Depending on workflow’s input parameters indexed and sorted BAM file - can be processed by `samtools rmdup` *samtools\_rmdup* to get rid of duplicated reads. - If removing duplicates is not required the original BAM and BAI - files are returned. 
Otherwise step *samtools\_sort\_index\_after\_rmdup* repeat `samtools sort` and `samtools index` with BAM and BAI files without duplicates. + can be processed by `samtools markdup` *samtools\_remove\_duplicates* to get rid of duplicated reads. Next `macs2 callpeak` performs peak calling *macs2\_callpeak* and the next step reports *macs2\_island\_count* the number of islands and estimated fragment size. If the latter diff --git a/workflows/chipseq-se.cwl b/workflows/chipseq-se.cwl index d3409eaa..466f2aaf 100644 --- a/workflows/chipseq-se.cwl +++ b/workflows/chipseq-se.cwl @@ -66,8 +66,11 @@ inputs: doc: "Make MACS2 call broad peaks by linking nearby highly enriched regions" fastq_file: - type: File - label: "FASTQ file" + type: + - File + - type: array + items: File + label: "FASTQ file(s)" format: "http://edamontology.org/format_1930" doc: "Single-read sequencing data in FASTQ format (fastq, fq, bzip2, gzip, zip)" @@ -111,6 +114,16 @@ inputs: label: "Remove PCR duplicates" doc: "Remove PCR duplicates from sorted BAM file" + peak_calling_fdr: + type: float? + default: 0.05 + 'sd:layout': + advanced: true + label: "Minimum FDR (q-value) cutoff for peak detection" + doc: | + Minimum FDR (q-value) cutoff for peak detection. -q, and + -p are mutually exclusive. + promoter_dist: type: int? default: 1000 @@ -233,7 +246,7 @@ outputs: format: "http://edamontology.org/format_2572" label: "Aligned reads" doc: "Coordinate sorted BAM alignment and index BAI files" - outputSource: samtools_sort_index_after_rmdup/bam_bai_pair + outputSource: samtools_remove_duplicates/deduplicated_bam_bai_pair 'sd:visualPlugins': - igvbrowser: tab: 'IGV Genome Browser' @@ -262,8 +275,7 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Narrow peaks" - displayMode: "COLLAPSE" - height: 40 + height: 120 macs2_broad_peaks: type: File? 
@@ -277,22 +289,21 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Broad peaks" - displayMode: "COLLAPSE" - height: 40 + height: 120 workflow_statistics_yaml: type: File? label: "YAML formatted combined log" format: "http://edamontology.org/format_3750" doc: "YAML formatted combined log" - outputSource: get_stat/collected_statistics_yaml + outputSource: get_statistics/collected_statistics_yaml workflow_statistics_markdown: type: File? label: "Markdown formatted combined log" format: "http://edamontology.org/format_3835" doc: "Markdown formatted combined log" - outputSource: get_stat/collected_statistics_md + outputSource: get_statistics/collected_statistics_md 'sd:visualPlugins': - markdownView: tab: 'Overview' @@ -302,7 +313,7 @@ outputs: label: "Workflow execution statistics" format: "http://edamontology.org/format_3475" doc: "Overall workflow execution statistics from bowtie_aligner and samtools_rmdup steps" - outputSource: get_stat/collected_statistics_tsv + outputSource: get_statistics/collected_statistics_tsv 'sd:visualPlugins': - tableView: vertical: true @@ -327,21 +338,22 @@ outputs: doc: "BAM statistics report (after all filters applied)" outputSource: get_bam_statistics_after_filtering/log_file - preseq_estimates_plot_data: + preseq_estimates: type: File? 
- label: "Preseq estimates" + label: "Expected Distinct Reads Count Plot" format: "http://edamontology.org/format_3475" - doc: "Preseq estimated results" - outputSource: preseq_plot_data/estimates_file_plot_data + doc: "Expected distinct reads count file from Preseq in TSV format" + outputSource: preseq/estimates_file 'sd:visualPlugins': - - line: + - scatter: tab: 'QC Plots' - Title: 'Distinct Read Counts Estimates' - xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' - yAxisTitle: 'Distinct Reads Count' - colors: ["#4b78a3", "#a3514b"] + Title: 'Expected Distinct Reads Count Plot' + xAxisTitle: 'Total reads count' + yAxisTitle: 'Expected distinct reads count' + colors: ["#4b78a3"] height: 500 - data: [$2, $5] + data: [$1, $2] + comparable: "preseq" estimated_fragment_size: type: int @@ -353,7 +365,7 @@ outputs: type: int label: "Mapped reads number" doc: "Mapped reads number for downstream analyses" - outputSource: get_stat/mapped_reads + outputSource: get_statistics/mapped_reads steps: @@ -367,6 +379,8 @@ steps: run: ../tools/extract-fastq.cwl in: compressed_file: fastq_file + output_prefix: + default: "read_1" out: [fastq_file] fastx_quality_stats: @@ -383,8 +397,11 @@ steps: bowtie_aligner: label: "Alignment to reference genome" doc: | - Aligns reads to the reference genome keeping only uniquely mapped reads with - less than 3 mismatches. + Aligns reads to the reference genome. + Reads are assumed to be mapped if they + have less than 3 mismatches. + sam_file output includes both mapped + and unmapped reads. 
run: ../tools/bowtie-alignreads.cwl in: upstream_filelist: extract_fastq/fastq_file @@ -423,11 +440,14 @@ steps: threads: threads out: [bam_bai_pair] - clean_sam_headers_for_preseq: - run: ../tools/samtools-clean-headers.cwl + samtools_mark_duplicates: + run: ../tools/samtools-markdup.cwl in: - bam_file: samtools_sort_index/bam_bai_pair - out: [preseq_bam] + bam_bai_pair: samtools_sort_index/bam_bai_pair + keep_duplicates: + default: true + threads: threads + out: [deduplicated_bam_bai_pair] preseq: label: "Sequencing depth estimation" @@ -436,34 +456,20 @@ steps: be expected from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: clean_sam_headers_for_preseq/preseq_bam + bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair extrapolation: default: 1000000000 - out: [estimates_file, log_file_stdout, log_file_stderr] + out: [estimates_file] - samtools_rmdup: - label: "PCR duplicates removal" - doc: | - Removes potential PCR duplicates. This step is used to remove reads overamplified - in PCR. Unfortunately, it may also remove "good" reads. We do not recommend to - remove duplicates unless the library is heavily duplicated. - run: ../tools/samtools-rmdup.cwl - in: - trigger: remove_duplicates - bam_file: samtools_sort_index/bam_bai_pair - single_end: - default: true - out: - - rmdup_output - - rmdup_log - - samtools_sort_index_after_rmdup: - run: ../tools/samtools-sort-index.cwl + samtools_remove_duplicates: + run: ../tools/samtools-markdup.cwl in: - trigger: remove_duplicates - sort_input: samtools_rmdup/rmdup_output + bam_bai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair + keep_duplicates: + source: remove_duplicates + valueFrom: $(!self) threads: threads - out: [bam_bai_pair] + out: [deduplicated_bam_bai_pair] macs2_callpeak: label: "Peak detection" @@ -472,7 +478,7 @@ steps: transcription factor binding sites. 
run: ../tools/macs2-callpeak-biowardrobe-only.cwl in: - treatment_file: samtools_sort_index_after_rmdup/bam_bai_pair + treatment_file: samtools_remove_duplicates/deduplicated_bam_bai_pair control_file: control_file nolambda: source: control_file @@ -491,8 +497,7 @@ steps: valueFrom: $(!self) keep_dup: default: auto - q_value: - default: 0.05 + q_value: peak_calling_fdr format_mode: default: BAM buffer_size: @@ -506,9 +511,9 @@ steps: bam_to_bigwig: run: ../tools/bam-bedgraph-bigwig.cwl in: - bam_file: samtools_sort_index_after_rmdup/bam_bai_pair + bam_file: samtools_remove_duplicates/deduplicated_bam_bai_pair chrom_length_file: chrom_length - mapped_reads_number: get_stat/mapped_reads + mapped_reads_number: get_statistics/mapped_reads fragment_size: macs2_callpeak/macs2_fragments_calculated out: [bigwig_file] @@ -519,43 +524,32 @@ steps: read length and quality score, etc. run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index/bam_bai_pair + bambai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index/bam_bai_pair + source: samtools_mark_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") out: [log_file] get_bam_statistics_after_filtering: run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index_after_rmdup/bam_bai_pair + bambai_pair: samtools_remove_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index_after_rmdup/bam_bai_pair + source: samtools_remove_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report_after_filtering.txt") - out: [log_file] + out: [log_file, reads_mapped] - get_stat: + get_statistics: run: ../tools/collect-statistics-chip-seq.cwl in: bowtie_alignment_report: bowtie_aligner/log_file bam_statistics_report: get_bam_statistics/log_file bam_statistics_after_filtering_report: get_bam_statistics_after_filtering/log_file macs2_called_peaks: 
macs2_callpeak/peak_xls_file + atdp_results: average_tag_density/result_file preseq_results: preseq/estimates_file out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] - preseq_plot_data: - label: "Formats sequencing depth estimation data for plotting" - doc: | - Formats estimates file from preseq standard output for QC plotting. This adds a new - column that includes the actual read count point on the plot. - run: ../tools/preseq-plot-data.cwl - in: - preseq_stderr_log_file: preseq/log_file_stderr - estimates_file: preseq/estimates_file - mapped_reads: get_stat/mapped_reads - out: [estimates_file_plot_data] - island_intersect: label: "Peak annotation" doc: | @@ -576,7 +570,7 @@ steps: elements are close to the TSS of their targets. run: ../tools/atdp.cwl in: - input_file: samtools_sort_index_after_rmdup/bam_bai_pair + input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: @@ -589,7 +583,9 @@ steps: default: "chrX chrY" avd_heat_window_bp: default: 200 - mapped_reads: get_stat/mapped_reads + mapped_reads: + source: get_bam_statistics_after_filtering/reads_mapped + valueFrom: $(parseInt(self)) out: [result_file] @@ -599,8 +595,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Deprecated. ChIP-Seq pipeline single-read" -s:name: "Deprecated. 
ChIP-Seq pipeline single-read" +label: "ChIP-Seq pipeline single-read" +s:name: "ChIP-Seq pipeline single-read" s:alternateName: "ChIP-Seq basic analysis workflow for single-read data" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/chipseq-se.cwl diff --git a/workflows/trim-atacseq-pe.cwl b/workflows/trim-atacseq-pe.cwl index 73e0ab90..ee59db3d 100644 --- a/workflows/trim-atacseq-pe.cwl +++ b/workflows/trim-atacseq-pe.cwl @@ -68,7 +68,7 @@ inputs: - File - type: array items: File - label: "FASTQ read 1 input file" + label: "FASTQ read 1 input file(s)" format: "http://edamontology.org/format_1930" doc: "Reads data in a FASTQ format, received after paired end sequencing" @@ -77,7 +77,7 @@ inputs: - File - type: array items: File - label: "FASTQ read 2 input file" + label: "FASTQ read 2 input file(s)" format: "http://edamontology.org/format_1930" doc: "Reads data in a FASTQ format, received after paired end sequencing" @@ -121,6 +121,16 @@ inputs: label: "Remove duplicates" doc: "Calls samtools rmdup to remove duplicates from sortesd BAM file" + peak_calling_fdr: + type: float? + default: 0.05 + 'sd:layout': + advanced: true + label: "Minimum FDR (q-value) cutoff for peak detection" + doc: | + Minimum FDR (q-value) cutoff for peak detection. -q, and + -p are mutually exclusive. + exclude_chromosome: type: string? default: "chrM" @@ -129,6 +139,14 @@ inputs: label: "Exclude chromosomes" doc: "Space separated list of chromosomes to be excluded" + do_not_scale: + type: boolean? + default: false + 'sd:layout': + advanced: true + label: "Do not scale genome coverage based on mapped reads number" + doc: "When converting BAM to bigWig, the scale will be set to 1 by default" + promoter_dist: type: int? 
default: 1000 @@ -280,19 +298,12 @@ outputs: data: [$1, $2] comparable: "atdp" - samtools_rmdup_log: - type: File - label: "Remove duplicates log" - format: "http://edamontology.org/format_2330" - doc: "Samtools rmdup generated log" - outputSource: samtools_rmdup/rmdup_log - bambai_pair: type: File format: "http://edamontology.org/format_2572" label: "Coordinate sorted BAM alignment file (+index BAI)" doc: "Coordinate sorted BAM file and BAI index file" - outputSource: samtools_sort_index_after_rmdup/bam_bai_pair + outputSource: samtools_remove_duplicates/deduplicated_bam_bai_pair 'sd:visualPlugins': - igvbrowser: tab: 'IGV Genome Browser' @@ -459,21 +470,22 @@ outputs: doc: "TrimGalore generated log for FASTQ 2" outputSource: trim_fastq/report_file_pair - preseq_estimates_plot_data: + preseq_estimates: type: File? label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq_plot_data/estimates_file_plot_data + outputSource: preseq/estimates_file 'sd:visualPlugins': - - line: + - scatter: tab: 'QC Plots' - Title: 'Distinct Read Counts Estimates' - xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' - yAxisTitle: 'Distinct Reads Count' - colors: ["#4b78a3", "#a3514b"] + Title: 'Preseq Estimates' + xAxisTitle: 'Total reads count' + yAxisTitle: 'Expected distinct reads count' + colors: ["#4b78a3"] height: 500 - data: [$2, $5] + data: [$1, $2] + comparable: "preseq" estimated_fragment_size: type: int @@ -497,9 +509,9 @@ steps: the core facility by providing a URL or from GEO by providing SRA accession number. run: ../tools/extract-fastq.cwl in: - output_prefix: - default: "read_1" compressed_file: fastq_file_upstream + output_prefix: + default: "read_1" out: [fastq_file] extract_fastq_downstream: @@ -510,9 +522,9 @@ steps: the core facility by providing a URL or from GEO by providing SRA accession number. 
run: ../tools/extract-fastq.cwl in: - output_prefix: - default: "read_2" compressed_file: fastq_file_downstream + output_prefix: + default: "read_2" out: [fastq_file] trim_fastq: @@ -603,8 +615,11 @@ steps: bowtie_aligner: label: "Alignment to reference genome" doc: | - Aligns reads to the reference genome keeping only uniquely mapped reads with - less than 3 mismatches. + Aligns reads to the reference genome. + Reads are assumed to be mapped if they + have less than 3 mismatches. + sam_file output includes both mapped + and unmapped reads. run: ../tools/bowtie-alignreads.cwl in: upstream_filelist: rename_upstream/target_file @@ -640,19 +655,22 @@ steps: threads: threads out: [bam_bai_pair] + samtools_mark_duplicates: + run: ../tools/samtools-markdup.cwl + in: + bam_bai_pair: samtools_sort_index/bam_bai_pair + keep_duplicates: + default: true + threads: threads + out: [deduplicated_bam_bai_pair] + filter_bam: run: ../tools/samtools-filter.cwl in: - bam_bai_pair: samtools_sort_index/bam_bai_pair + bam_bai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair exclude_chromosome: exclude_chromosome out: [filtered_bam_bai_pair] - clean_sam_headers_for_preseq: - run: ../tools/samtools-clean-headers.cwl - in: - bam_file: filter_bam/filtered_bam_bai_pair - out: [preseq_bam] - preseq: label: "Sequencing depth estimation" doc: | @@ -660,32 +678,22 @@ steps: be expected from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: clean_sam_headers_for_preseq/preseq_bam + bam_file: filter_bam/filtered_bam_bai_pair pe_mode: default: true extrapolation: default: 1000000000 - out: [estimates_file, log_file_stdout, log_file_stderr] + out: [estimates_file] - samtools_rmdup: - label: "PCR duplicates removal" - doc: | - Removes potential PCR duplicates. This step is used to remove reads overamplified - in PCR. Unfortunately, it may also remove "good" reads. 
We do not recommend to - remove duplicates unless the library is heavily duplicated. - run: ../tools/samtools-rmdup.cwl - in: - trigger: remove_duplicates - bam_file: filter_bam/filtered_bam_bai_pair - out: [rmdup_output, rmdup_log] - - samtools_sort_index_after_rmdup: - run: ../tools/samtools-sort-index.cwl + samtools_remove_duplicates: + run: ../tools/samtools-markdup.cwl in: - trigger: remove_duplicates - sort_input: samtools_rmdup/rmdup_output + bam_bai_pair: filter_bam/filtered_bam_bai_pair + keep_duplicates: + source: remove_duplicates + valueFrom: $(!self) threads: threads - out: [bam_bai_pair] + out: [deduplicated_bam_bai_pair] macs2_callpeak: label: "Peak detection" @@ -694,7 +702,7 @@ steps: transcription factor binding sites. run: ../tools/macs2-callpeak-biowardrobe-only.cwl in: - treatment_file: samtools_sort_index_after_rmdup/bam_bai_pair + treatment_file: samtools_remove_duplicates/deduplicated_bam_bai_pair control_file: control_file nolambda: source: control_file @@ -713,8 +721,7 @@ steps: valueFrom: $(!self) keep_dup: default: auto - q_value: - default: 0.05 + q_value: peak_calling_fdr format_mode: default: BAMPE buffer_size: @@ -735,9 +742,11 @@ steps: bam_to_bigwig: run: ../tools/bam-bedgraph-bigwig.cwl in: - bam_file: samtools_sort_index_after_rmdup/bam_bai_pair + bam_file: samtools_remove_duplicates/deduplicated_bam_bai_pair chrom_length_file: chrom_length - mapped_reads_number: get_stat/mapped_reads + mapped_reads_number: + source: [do_not_scale, get_stat/mapped_reads] + valueFrom: $(self[0]?null:self[1]) fragment_size: default: 9 out: [bigwig_file] @@ -749,20 +758,20 @@ steps: read length and quality score, etc. 
run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index/bam_bai_pair + bambai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index/bam_bai_pair + source: samtools_mark_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") out: [log_file] get_bam_statistics_after_filtering: run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index_after_rmdup/bam_bai_pair + bambai_pair: samtools_remove_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index_after_rmdup/bam_bai_pair + source: samtools_remove_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report_after_filtering.txt") - out: [log_file, ext_is_section] + out: [log_file, ext_is_section, reads_mapped] get_stat: run: ../tools/collect-statistics-chip-seq.cwl @@ -773,23 +782,12 @@ steps: bam_statistics_report: get_bam_statistics/log_file bam_statistics_after_filtering_report: get_bam_statistics_after_filtering/log_file macs2_called_peaks: macs2_callpeak/peak_xls_file + atdp_results: average_tag_density/result_file preseq_results: preseq/estimates_file paired_end: default: True out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] - preseq_plot_data: - label: "Formats sequencing depth estimation data for plotting" - doc: | - Formats estimates file from preseq standard output for QC plotting. This adds a new - column that includes the actual read count point on the plot. - run: ../tools/preseq-plot-data.cwl - in: - preseq_stderr_log_file: preseq/log_file_stderr - estimates_file: preseq/estimates_file - mapped_reads: get_stat/mapped_reads - out: [estimates_file_plot_data] - island_intersect: label: "Peak annotation" doc: | @@ -810,7 +808,7 @@ steps: elements are close to the TSS of their targets. 
run: ../tools/atdp.cwl in: - input_file: samtools_sort_index_after_rmdup/bam_bai_pair + input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: @@ -823,7 +821,9 @@ steps: default: "chrX chrY" avd_heat_window_bp: default: 200 - mapped_reads: get_stat/mapped_reads + mapped_reads: + source: get_bam_statistics_after_filtering/reads_mapped + valueFrom: $(parseInt(self/2)) out: [result_file, log_file] @@ -913,11 +913,8 @@ doc: | files to reference genome (Step bowtie_aligner). The output of this step is unsorted SAM file which is being sorted and indexed by samtools sort and samtools index (Step samtools_sort_index). Depending on workflow’s input parameters indexed and sorted BAM file - could be processed by samtools rmdup (Step samtools_rmdup) to remove all possible read duplicates. - In a case when removing duplicates is not necessary the step returns original input BAM and BAI - files without any processing. If the duplicates were removed the following step - (Step samtools_sort_index_after_rmdup) reruns samtools sort and samtools index with BAM and BAI files, - if not - the step returns original unchanged input files. Right after that macs2 callpeak performs + could be processed by `samtools markdup` *samtools\_remove\_duplicates* to remove all possible read duplicates. + Right after that macs2 callpeak performs peak calling (Step macs2_callpeak). On the base of returned outputs the next step (Step macs2_island_count) calculates the number of islands and estimated fragment size. 
If the last one is less that 80 (hardcoded in a workflow) macs2 callpeak is rerun again with forced fixed diff --git a/workflows/trim-atacseq-se.cwl b/workflows/trim-atacseq-se.cwl index 4c4d0901..ca85f219 100644 --- a/workflows/trim-atacseq-se.cwl +++ b/workflows/trim-atacseq-se.cwl @@ -67,7 +67,7 @@ inputs: - File - type: array items: File - label: "FASTQ input file" + label: "FASTQ input file(s)" format: "http://edamontology.org/format_1930" doc: "Reads data in a FASTQ format, received after single end sequencing" @@ -111,6 +111,16 @@ inputs: label: "Remove duplicates" doc: "Calls samtools rmdup to remove duplicates from sortesd BAM file" + peak_calling_fdr: + type: float? + default: 0.05 + 'sd:layout': + advanced: true + label: "Minimum FDR (q-value) cutoff for peak detection" + doc: | + Minimum FDR (q-value) cutoff for peak detection. -q, and + -p are mutually exclusive. + exclude_chromosome: type: string? default: "chrM" @@ -119,6 +129,14 @@ inputs: label: "Exclude chromosomes" doc: "Space separated list of chromosomes to be excluded" + do_not_scale: + type: boolean? + default: false + 'sd:layout': + advanced: true + label: "Do not scale genome coverage based on mapped reads number" + doc: "When converting BAM to bigWig, the scale will be set to 1 by default" + promoter_dist: type: int? 
default: 1000 @@ -248,19 +266,12 @@ outputs: data: [$1, $2] comparable: "atdp" - samtools_rmdup_log: - type: File - label: "Remove duplicates log" - format: "http://edamontology.org/format_2330" - doc: "Samtools rmdup generated log" - outputSource: samtools_rmdup/rmdup_log - bambai_pair: type: File format: "http://edamontology.org/format_2572" label: "Coordinate sorted BAM alignment file (+index BAI)" doc: "Coordinate sorted BAM file and BAI index file" - outputSource: samtools_sort_index_after_rmdup/bam_bai_pair + outputSource: samtools_remove_duplicates/deduplicated_bam_bai_pair 'sd:visualPlugins': - igvbrowser: tab: 'IGV Genome Browser' @@ -404,21 +415,22 @@ outputs: doc: "TrimGalore generated log" outputSource: trim_fastq/report_file - preseq_estimates_plot_data: + preseq_estimates: type: File? label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq_plot_data/estimates_file_plot_data + outputSource: preseq/estimates_file 'sd:visualPlugins': - - line: + - scatter: tab: 'QC Plots' - Title: 'Distinct Read Counts Estimates' - xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' - yAxisTitle: 'Distinct Reads Count' - colors: ["#4b78a3", "#a3514b"] + Title: 'Preseq Estimates' + xAxisTitle: 'Total reads count' + yAxisTitle: 'Expected distinct reads count' + colors: ["#4b78a3"] height: 500 - data: [$2, $5] + data: [$1, $2] + comparable: "preseq" estimated_fragment_size: type: int @@ -443,6 +455,8 @@ steps: run: ../tools/extract-fastq.cwl in: compressed_file: fastq_file + output_prefix: + default: "read_1" out: [fastq_file] trim_fastq: @@ -500,8 +514,11 @@ steps: bowtie_aligner: label: "Alignment to reference genome" doc: | - Aligns reads to the reference genome keeping only uniquely mapped reads with - less than 3 mismatches. + Aligns reads to the reference genome. + Reads are assumed to be mapped if they + have less than 3 mismatches. + sam_file output includes both mapped + and unmapped reads. 
run: ../tools/bowtie-alignreads.cwl in: upstream_filelist: rename/target_file @@ -536,19 +553,22 @@ steps: threads: threads out: [bam_bai_pair] + samtools_mark_duplicates: + run: ../tools/samtools-markdup.cwl + in: + bam_bai_pair: samtools_sort_index/bam_bai_pair + keep_duplicates: + default: true + threads: threads + out: [deduplicated_bam_bai_pair] + filter_bam: run: ../tools/samtools-filter.cwl in: - bam_bai_pair: samtools_sort_index/bam_bai_pair + bam_bai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair exclude_chromosome: exclude_chromosome out: [filtered_bam_bai_pair] - clean_sam_headers_for_preseq: - run: ../tools/samtools-clean-headers.cwl - in: - bam_file: filter_bam/filtered_bam_bai_pair - out: [preseq_bam] - preseq: label: "Sequencing depth estimation" doc: | @@ -556,32 +576,20 @@ steps: be expected from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: clean_sam_headers_for_preseq/preseq_bam + bam_file: filter_bam/filtered_bam_bai_pair extrapolation: default: 1000000000 - out: [estimates_file, log_file_stdout, log_file_stderr] - - samtools_rmdup: - label: "PCR duplicates removal" - doc: | - Removes potential PCR duplicates. This step is used to remove reads overamplified - in PCR. Unfortunately, it may also remove "good" reads. We do not recommend to - remove duplicates unless the library is heavily duplicated. 
- run: ../tools/samtools-rmdup.cwl - in: - trigger: remove_duplicates - bam_file: filter_bam/filtered_bam_bai_pair - single_end: - default: true - out: [rmdup_output, rmdup_log] + out: [estimates_file] - samtools_sort_index_after_rmdup: - run: ../tools/samtools-sort-index.cwl + samtools_remove_duplicates: + run: ../tools/samtools-markdup.cwl in: - trigger: remove_duplicates - sort_input: samtools_rmdup/rmdup_output + bam_bai_pair: filter_bam/filtered_bam_bai_pair + keep_duplicates: + source: remove_duplicates + valueFrom: $(!self) threads: threads - out: [bam_bai_pair] + out: [deduplicated_bam_bai_pair] macs2_callpeak: label: "Peak detection" @@ -590,7 +598,7 @@ steps: transcription factor binding sites. run: ../tools/macs2-callpeak-biowardrobe-only.cwl in: - treatment_file: samtools_sort_index_after_rmdup/bam_bai_pair + treatment_file: samtools_remove_duplicates/deduplicated_bam_bai_pair control_file: control_file nolambda: source: control_file @@ -609,8 +617,7 @@ steps: valueFrom: $(!self) keep_dup: default: auto - q_value: - default: 0.05 + q_value: peak_calling_fdr format_mode: default: BAM buffer_size: @@ -631,9 +638,11 @@ steps: bam_to_bigwig: run: ../tools/bam-bedgraph-bigwig.cwl in: - bam_file: samtools_sort_index_after_rmdup/bam_bai_pair + bam_file: samtools_remove_duplicates/deduplicated_bam_bai_pair chrom_length_file: chrom_length - mapped_reads_number: get_stat/mapped_reads + mapped_reads_number: + source: [do_not_scale, get_stat/mapped_reads] + valueFrom: $(self[0]?null:self[1]) fragment_size: default: 9 out: [bigwig_file] @@ -645,20 +654,20 @@ steps: read length and quality score, etc. 
run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index/bam_bai_pair + bambai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index/bam_bai_pair + source: samtools_mark_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") out: [log_file] get_bam_statistics_after_filtering: run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index_after_rmdup/bam_bai_pair + bambai_pair: samtools_remove_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index_after_rmdup/bam_bai_pair + source: samtools_remove_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report_after_filtering.txt") - out: [log_file] + out: [log_file, reads_mapped] get_stat: run: ../tools/collect-statistics-chip-seq.cwl @@ -668,21 +677,10 @@ steps: bam_statistics_report: get_bam_statistics/log_file bam_statistics_after_filtering_report: get_bam_statistics_after_filtering/log_file macs2_called_peaks: macs2_callpeak/peak_xls_file + atdp_results: average_tag_density/result_file preseq_results: preseq/estimates_file out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] - preseq_plot_data: - label: "Formats sequencing depth estimation data for plotting" - doc: | - Formats estimates file from preseq standard output for QC plotting. This adds a new - column that includes the actual read count point on the plot. - run: ../tools/preseq-plot-data.cwl - in: - preseq_stderr_log_file: preseq/log_file_stderr - estimates_file: preseq/estimates_file - mapped_reads: get_stat/mapped_reads - out: [estimates_file_plot_data] - island_intersect: label: "Peak annotation" doc: | @@ -703,7 +701,7 @@ steps: elements are close to the TSS of their targets. 
run: ../tools/atdp.cwl in: - input_file: samtools_sort_index_after_rmdup/bam_bai_pair + input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: @@ -716,7 +714,9 @@ steps: default: "chrX chrY" avd_heat_window_bp: default: 200 - mapped_reads: get_stat/mapped_reads + mapped_reads: + source: get_bam_statistics_after_filtering/reads_mapped + valueFrom: $(parseInt(self)) out: [result_file, log_file] @@ -805,9 +805,7 @@ doc: | *samtools\_sort\_index*. Based on workflow’s input parameters indexed and sorted BAM file - can be processed by `samtools rmdup` *samtools\_rmdup* to get rid of duplicated reads. - If removing duplicates is not required the original input BAM and BAI - files return. Otherwise step *samtools\_sort\_index\_after\_rmdup* repeat `samtools sort` and `samtools index` with BAM and BAI files. + can be processed by `samtools markdup` *samtools\_remove\_duplicates* to get rid of duplicated reads. Right after that `macs2 callpeak` performs peak calling *macs2\_callpeak*. On the base of returned outputs the next step *macs2\_island\_count* calculates the number of islands and estimated fragment size. 
If the last diff --git a/workflows/trim-chipseq-pe.cwl b/workflows/trim-chipseq-pe.cwl index 5d658f2a..e10519f8 100644 --- a/workflows/trim-chipseq-pe.cwl +++ b/workflows/trim-chipseq-pe.cwl @@ -68,7 +68,7 @@ inputs: - File - type: array items: File - label: "FASTQ 1 input file" + label: "FASTQ 1 input file(s)" format: "http://edamontology.org/format_1930" doc: "Reads data in a FASTQ format, received after paired end sequencing" @@ -77,7 +77,7 @@ inputs: - File - type: array items: File - label: "FASTQ 2 input file" + label: "FASTQ 2 input file(s)" format: "http://edamontology.org/format_1930" doc: "Reads data in a FASTQ format, received after paired end sequencing" @@ -121,6 +121,16 @@ inputs: label: "Remove duplicates" doc: "Calls samtools rmdup to remove duplicates from sortesd BAM file" + peak_calling_fdr: + type: float? + default: 0.05 + 'sd:layout': + advanced: true + label: "Minimum FDR (q-value) cutoff for peak detection" + doc: | + Minimum FDR (q-value) cutoff for peak detection. -q, and + -p are mutually exclusive. + promoter_dist: type: int? default: 1000 @@ -272,19 +282,12 @@ outputs: data: [$1, $2] comparable: "atdp" - samtools_rmdup_log: - type: File - label: "Remove duplicates log" - format: "http://edamontology.org/format_2330" - doc: "Samtools rmdup generated log" - outputSource: samtools_rmdup/rmdup_log - bambai_pair: type: File format: "http://edamontology.org/format_2572" label: "Coordinate sorted BAM alignment file (+index BAI)" doc: "Coordinate sorted BAM file and BAI index file" - outputSource: samtools_sort_index_after_rmdup/bam_bai_pair + outputSource: samtools_remove_duplicates/deduplicated_bam_bai_pair 'sd:visualPlugins': - igvbrowser: tab: 'IGV Genome Browser' @@ -451,21 +454,22 @@ outputs: doc: "TrimGalore generated log for FASTQ 2" outputSource: trim_fastq/report_file_pair - preseq_estimates_plot_data: + preseq_estimates: type: File? 
label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq_plot_data/estimates_file_plot_data + outputSource: preseq/estimates_file 'sd:visualPlugins': - - line: + - scatter: tab: 'QC Plots' - Title: 'Distinct Read Counts Estimates' - xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' - yAxisTitle: 'Distinct Reads Count' - colors: ["#4b78a3", "#a3514b"] + Title: 'Preseq Estimates' + xAxisTitle: 'Total reads count' + yAxisTitle: 'Expected distinct reads count' + colors: ["#4b78a3"] height: 500 - data: [$2, $5] + data: [$1, $2] + comparable: "preseq" estimated_fragment_size: type: int @@ -489,9 +493,9 @@ steps: the core facility by providing a URL or from GEO by providing SRA accession number. run: ../tools/extract-fastq.cwl in: - output_prefix: - default: "read_1" compressed_file: fastq_file_upstream + output_prefix: + default: "read_1" out: [fastq_file] extract_fastq_downstream: @@ -502,9 +506,9 @@ steps: the core facility by providing a URL or from GEO by providing SRA accession number. run: ../tools/extract-fastq.cwl in: - output_prefix: - default: "read_2" compressed_file: fastq_file_downstream + output_prefix: + default: "read_2" out: [fastq_file] trim_fastq: @@ -595,8 +599,11 @@ steps: bowtie_aligner: label: "Alignment to reference genome" doc: | - Aligns reads to the reference genome keeping only uniquely mapped reads with - less than 3 mismatches. + Aligns reads to the reference genome. + Reads are assumed to be mapped if they + have less than 3 mismatches. + sam_file output includes both mapped + and unmapped reads. 
run: ../tools/bowtie-alignreads.cwl in: upstream_filelist: rename_upstream/target_file @@ -632,11 +639,14 @@ steps: threads: threads out: [bam_bai_pair] - clean_sam_headers_for_preseq: - run: ../tools/samtools-clean-headers.cwl + samtools_mark_duplicates: + run: ../tools/samtools-markdup.cwl in: - bam_file: samtools_sort_index/bam_bai_pair - out: [preseq_bam] + bam_bai_pair: samtools_sort_index/bam_bai_pair + keep_duplicates: + default: true + threads: threads + out: [deduplicated_bam_bai_pair] preseq: label: "Sequencing depth estimation" @@ -645,32 +655,22 @@ steps: be expected from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: clean_sam_headers_for_preseq/preseq_bam + bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair pe_mode: default: true extrapolation: default: 1000000000 - out: [estimates_file, log_file_stdout, log_file_stderr] + out: [estimates_file] - samtools_rmdup: - label: "PCR duplicates removal" - doc: | - Removes potential PCR duplicates. This step is used to remove reads overamplified - in PCR. Unfortunately, it may also remove "good" reads. We do not recommend to - remove duplicates unless the library is heavily duplicated. - run: ../tools/samtools-rmdup.cwl + samtools_remove_duplicates: + run: ../tools/samtools-markdup.cwl in: - trigger: remove_duplicates - bam_file: samtools_sort_index/bam_bai_pair - out: [rmdup_output, rmdup_log] - - samtools_sort_index_after_rmdup: - run: ../tools/samtools-sort-index.cwl - in: - trigger: remove_duplicates - sort_input: samtools_rmdup/rmdup_output + bam_bai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair + keep_duplicates: + source: remove_duplicates + valueFrom: $(!self) threads: threads - out: [bam_bai_pair] + out: [deduplicated_bam_bai_pair] macs2_callpeak: label: "Peak detection" @@ -679,7 +679,7 @@ steps: transcription factor binding sites. 
run: ../tools/macs2-callpeak-biowardrobe-only.cwl in: - treatment_file: samtools_sort_index_after_rmdup/bam_bai_pair + treatment_file: samtools_remove_duplicates/deduplicated_bam_bai_pair control_file: control_file nolambda: source: control_file @@ -698,8 +698,7 @@ steps: valueFrom: $(!self) keep_dup: default: auto - q_value: - default: 0.05 + q_value: peak_calling_fdr format_mode: default: BAMPE buffer_size: @@ -720,7 +719,7 @@ steps: bam_to_bigwig: run: ../tools/bam-bedgraph-bigwig.cwl in: - bam_file: samtools_sort_index_after_rmdup/bam_bai_pair + bam_file: samtools_remove_duplicates/deduplicated_bam_bai_pair chrom_length_file: chrom_length mapped_reads_number: get_stat/mapped_reads pairchip: @@ -734,20 +733,20 @@ steps: read length and quality score, etc. run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index/bam_bai_pair + bambai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index/bam_bai_pair + source: samtools_mark_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") out: [log_file] get_bam_statistics_after_filtering: run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index_after_rmdup/bam_bai_pair + bambai_pair: samtools_remove_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index_after_rmdup/bam_bai_pair + source: samtools_remove_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report_after_filtering.txt") - out: [log_file, ext_is_section] + out: [log_file, ext_is_section, reads_mapped] get_stat: run: ../tools/collect-statistics-chip-seq.cwl @@ -758,23 +757,12 @@ steps: bam_statistics_report: get_bam_statistics/log_file bam_statistics_after_filtering_report: get_bam_statistics_after_filtering/log_file macs2_called_peaks: macs2_callpeak/peak_xls_file + atdp_results: average_tag_density/result_file preseq_results: preseq/estimates_file paired_end: default: 
True out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] - preseq_plot_data: - label: "Formats sequencing depth estimation data for plotting" - doc: | - Formats estimates file from preseq standard output for QC plotting. This adds a new - column that includes the actual read count point on the plot. - run: ../tools/preseq-plot-data.cwl - in: - preseq_stderr_log_file: preseq/log_file_stderr - estimates_file: preseq/estimates_file - mapped_reads: get_stat/mapped_reads - out: [estimates_file_plot_data] - island_intersect: label: "Peak annotation" doc: | @@ -795,7 +783,7 @@ steps: elements are close to the TSS of their targets. run: ../tools/atdp.cwl in: - input_file: samtools_sort_index_after_rmdup/bam_bai_pair + input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: @@ -808,7 +796,9 @@ steps: default: "chrX chrY" avd_heat_window_bp: default: 200 - mapped_reads: get_stat/mapped_reads + mapped_reads: + source: get_bam_statistics_after_filtering/reads_mapped + valueFrom: $(parseInt(self/2)) out: [result_file, log_file] @@ -894,11 +884,8 @@ doc: | files to reference genome (Step bowtie_aligner). The output of this step is unsorted SAM file which is being sorted and indexed by samtools sort and samtools index (Step samtools_sort_index). Depending on workflow’s input parameters indexed and sorted BAM file - could be processed by samtools rmdup (Step samtools_rmdup) to remove all possible read duplicates. - In a case when removing duplicates is not necessary the step returns original input BAM and BAI - files without any processing. If the duplicates were removed the following step - (Step samtools_sort_index_after_rmdup) reruns samtools sort and samtools index with BAM and BAI files, - if not - the step returns original unchanged input files. 
Right after that macs2 callpeak performs + could be processed by `samtools markdup` *samtools\_remove\_duplicates* to remove all possible read duplicates. + Right after that macs2 callpeak performs peak calling (Step macs2_callpeak). On the base of returned outputs the next step (Step macs2_island_count) calculates the number of islands and estimated fragment size. If the last one is less that 80 (hardcoded in a workflow) macs2 callpeak is rerun again with forced fixed diff --git a/workflows/trim-chipseq-se.cwl b/workflows/trim-chipseq-se.cwl index f12ff0a6..687ac248 100644 --- a/workflows/trim-chipseq-se.cwl +++ b/workflows/trim-chipseq-se.cwl @@ -68,7 +68,7 @@ inputs: - File - type: array items: File - label: "FASTQ input file" + label: "FASTQ input file(s)" format: "http://edamontology.org/format_1930" doc: "Reads data in a FASTQ format, received after single end sequencing" @@ -112,6 +112,16 @@ inputs: label: "Remove duplicates" doc: "Calls samtools rmdup to remove duplicates from sortesd BAM file" + peak_calling_fdr: + type: float? + default: 0.05 + 'sd:layout': + advanced: true + label: "Minimum FDR (q-value) cutoff for peak detection" + doc: | + Minimum FDR (q-value) cutoff for peak detection. -q, and + -p are mutually exclusive. + promoter_dist: type: int? 
default: 1000 @@ -241,19 +251,12 @@ outputs: data: [$1, $2] comparable: "atdp" - samtools_rmdup_log: - type: File - label: "Remove duplicates log" - format: "http://edamontology.org/format_2330" - doc: "Samtools rmdup generated log" - outputSource: samtools_rmdup/rmdup_log - bambai_pair: type: File format: "http://edamontology.org/format_2572" label: "Coordinate sorted BAM alignment file (+index BAI)" doc: "Coordinate sorted BAM file and BAI index file" - outputSource: samtools_sort_index_after_rmdup/bam_bai_pair + outputSource: samtools_remove_duplicates/deduplicated_bam_bai_pair 'sd:visualPlugins': - igvbrowser: tab: 'IGV Genome Browser' @@ -397,21 +400,22 @@ outputs: doc: "TrimGalore generated log" outputSource: trim_fastq/report_file - preseq_estimates_plot_data: + preseq_estimates: type: File? label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq_plot_data/estimates_file_plot_data + outputSource: preseq/estimates_file 'sd:visualPlugins': - - line: + - scatter: tab: 'QC Plots' - Title: 'Distinct Read Counts Estimates' - xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' - yAxisTitle: 'Distinct Reads Count' - colors: ["#4b78a3", "#a3514b"] + Title: 'Preseq Estimates' + xAxisTitle: 'Total reads count' + yAxisTitle: 'Expected distinct reads count' + colors: ["#4b78a3"] height: 500 - data: [$2, $5] + data: [$1, $2] + comparable: "preseq" estimated_fragment_size: type: int @@ -436,6 +440,8 @@ steps: run: ../tools/extract-fastq.cwl in: compressed_file: fastq_file + output_prefix: + default: "read_1" out: [fastq_file] trim_fastq: @@ -493,8 +499,11 @@ steps: bowtie_aligner: label: "Alignment to reference genome" doc: | - Aligns reads to the reference genome keeping only uniquely mapped reads with - less than 3 mismatches. + Aligns reads to the reference genome. + Reads are assumed to be mapped if they + have less than 3 mismatches. + sam_file output includes both mapped + and unmapped reads. 
run: ../tools/bowtie-alignreads.cwl in: upstream_filelist: rename/target_file @@ -529,11 +538,14 @@ steps: threads: threads out: [bam_bai_pair] - clean_sam_headers_for_preseq: - run: ../tools/samtools-clean-headers.cwl + samtools_mark_duplicates: + run: ../tools/samtools-markdup.cwl in: - bam_file: samtools_sort_index/bam_bai_pair - out: [preseq_bam] + bam_bai_pair: samtools_sort_index/bam_bai_pair + keep_duplicates: + default: true + threads: threads + out: [deduplicated_bam_bai_pair] preseq: label: "Sequencing depth estimation" @@ -542,32 +554,20 @@ steps: be expected from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: clean_sam_headers_for_preseq/preseq_bam + bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair extrapolation: default: 1000000000 - out: [estimates_file, log_file_stdout, log_file_stderr] + out: [estimates_file] - samtools_rmdup: - label: "PCR duplicates removal" - doc: | - Removes potential PCR duplicates. This step is used to remove reads overamplified - in PCR. Unfortunately, it may also remove "good" reads. We do not recommend to - remove duplicates unless the library is heavily duplicated. - run: ../tools/samtools-rmdup.cwl - in: - trigger: remove_duplicates - bam_file: samtools_sort_index/bam_bai_pair - single_end: - default: true - out: [rmdup_output, rmdup_log] - - samtools_sort_index_after_rmdup: - run: ../tools/samtools-sort-index.cwl + samtools_remove_duplicates: + run: ../tools/samtools-markdup.cwl in: - trigger: remove_duplicates - sort_input: samtools_rmdup/rmdup_output + bam_bai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair + keep_duplicates: + source: remove_duplicates + valueFrom: $(!self) threads: threads - out: [bam_bai_pair] + out: [deduplicated_bam_bai_pair] macs2_callpeak: label: "Peak detection" @@ -576,7 +576,7 @@ steps: transcription factor binding sites. 
run: ../tools/macs2-callpeak-biowardrobe-only.cwl in: - treatment_file: samtools_sort_index_after_rmdup/bam_bai_pair + treatment_file: samtools_remove_duplicates/deduplicated_bam_bai_pair control_file: control_file nolambda: source: control_file @@ -595,8 +595,7 @@ steps: valueFrom: $(!self) keep_dup: default: auto - q_value: - default: 0.05 + q_value: peak_calling_fdr format_mode: default: BAM buffer_size: @@ -617,7 +616,7 @@ steps: bam_to_bigwig: run: ../tools/bam-bedgraph-bigwig.cwl in: - bam_file: samtools_sort_index_after_rmdup/bam_bai_pair + bam_file: samtools_remove_duplicates/deduplicated_bam_bai_pair chrom_length_file: chrom_length mapped_reads_number: get_stat/mapped_reads fragment_size: macs2_callpeak/macs2_fragments_calculated @@ -630,20 +629,20 @@ steps: read length and quality score, etc. run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index/bam_bai_pair + bambai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index/bam_bai_pair + source: samtools_mark_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") out: [log_file] get_bam_statistics_after_filtering: run: ../tools/samtools-stats.cwl in: - bambai_pair: samtools_sort_index_after_rmdup/bam_bai_pair + bambai_pair: samtools_remove_duplicates/deduplicated_bam_bai_pair output_filename: - source: samtools_sort_index_after_rmdup/bam_bai_pair + source: samtools_remove_duplicates/deduplicated_bam_bai_pair valueFrom: $(get_root(self.basename)+"_bam_statistics_report_after_filtering.txt") - out: [log_file] + out: [log_file, reads_mapped] get_stat: run: ../tools/collect-statistics-chip-seq.cwl @@ -653,21 +652,10 @@ steps: bam_statistics_report: get_bam_statistics/log_file bam_statistics_after_filtering_report: get_bam_statistics_after_filtering/log_file macs2_called_peaks: macs2_callpeak/peak_xls_file + atdp_results: average_tag_density/result_file preseq_results: preseq/estimates_file out: 
[collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] - preseq_plot_data: - label: "Formats sequencing depth estimation data for plotting" - doc: | - Formats estimates file from preseq standard output for QC plotting. This adds a new - column that includes the actual read count point on the plot. - run: ../tools/preseq-plot-data.cwl - in: - preseq_stderr_log_file: preseq/log_file_stderr - estimates_file: preseq/estimates_file - mapped_reads: get_stat/mapped_reads - out: [estimates_file_plot_data] - island_intersect: label: "Peak annotation" doc: | @@ -688,7 +676,7 @@ steps: elements are close to the TSS of their targets. run: ../tools/atdp.cwl in: - input_file: samtools_sort_index_after_rmdup/bam_bai_pair + input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: @@ -701,7 +689,9 @@ steps: default: "chrX chrY" avd_heat_window_bp: default: 200 - mapped_reads: get_stat/mapped_reads + mapped_reads: + source: get_bam_statistics_after_filtering/reads_mapped + valueFrom: $(parseInt(self)) out: [result_file, log_file] @@ -789,9 +779,7 @@ doc: | *samtools\_sort\_index*. Based on workflow’s input parameters indexed and sorted BAM file - can be processed by `samtools rmdup` *samtools\_rmdup* to get rid of duplicated reads. - If removing duplicates is not required the original input BAM and BAI - files return. Otherwise step *samtools\_sort\_index\_after\_rmdup* repeat `samtools sort` and `samtools index` with BAM and BAI files. + can be processed by `samtools markdup` *samtools\_remove\_duplicates* to get rid of duplicated reads. Right after that `macs2 callpeak` performs peak calling *macs2\_callpeak*. On the base of returned outputs the next step *macs2\_island\_count* calculates the number of islands and estimated fragment size. 
If the last From 2d851c596c1a3401b3b0158260390cfd98850968 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 18 Sep 2023 12:00:14 -0400 Subject: [PATCH 075/162] Need to use old samtools to create an index for atdp --- workflows/chipseq-pe.cwl | 9 ++++++++- workflows/chipseq-se.cwl | 9 ++++++++- workflows/trim-atacseq-pe.cwl | 9 ++++++++- workflows/trim-atacseq-se.cwl | 9 ++++++++- workflows/trim-chipseq-pe.cwl | 9 ++++++++- workflows/trim-chipseq-se.cwl | 9 ++++++++- 6 files changed, 48 insertions(+), 6 deletions(-) diff --git a/workflows/chipseq-pe.cwl b/workflows/chipseq-pe.cwl index 1c5c2e65..b51275c4 100644 --- a/workflows/chipseq-pe.cwl +++ b/workflows/chipseq-pe.cwl @@ -697,6 +697,13 @@ steps: upstream_bp: upstream_dist out: [result_file, log_file] + samtools_sort_index_for_atdp: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: samtools_remove_duplicates/deduplicated_bam_bai_pair + threads: threads + out: [bam_bai_pair] + average_tag_density: label: "Read enrichment around genes TSS" doc: | @@ -704,7 +711,7 @@ steps: elements are close to the TSS of their targets. run: ../tools/atdp.cwl in: - input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair + input_file: samtools_sort_index_for_atdp/bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: diff --git a/workflows/chipseq-se.cwl b/workflows/chipseq-se.cwl index 466f2aaf..1bdcf8b1 100644 --- a/workflows/chipseq-se.cwl +++ b/workflows/chipseq-se.cwl @@ -563,6 +563,13 @@ steps: upstream_bp: upstream_dist out: [result_file] + samtools_sort_index_for_atdp: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: samtools_remove_duplicates/deduplicated_bam_bai_pair + threads: threads + out: [bam_bai_pair] + average_tag_density: label: "Read enrichment around genes TSS" doc: | @@ -570,7 +577,7 @@ steps: elements are close to the TSS of their targets. 
run: ../tools/atdp.cwl in: - input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair + input_file: samtools_sort_index_for_atdp/bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: diff --git a/workflows/trim-atacseq-pe.cwl b/workflows/trim-atacseq-pe.cwl index ee59db3d..1736809b 100644 --- a/workflows/trim-atacseq-pe.cwl +++ b/workflows/trim-atacseq-pe.cwl @@ -801,6 +801,13 @@ steps: upstream_bp: upstream_dist out: [result_file, log_file] + samtools_sort_index_for_atdp: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: samtools_remove_duplicates/deduplicated_bam_bai_pair + threads: threads + out: [bam_bai_pair] + average_tag_density: label: "Read enrichment around genes TSS" doc: | @@ -808,7 +815,7 @@ steps: elements are close to the TSS of their targets. run: ../tools/atdp.cwl in: - input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair + input_file: samtools_sort_index_for_atdp/bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: diff --git a/workflows/trim-atacseq-se.cwl b/workflows/trim-atacseq-se.cwl index ca85f219..9f90b747 100644 --- a/workflows/trim-atacseq-se.cwl +++ b/workflows/trim-atacseq-se.cwl @@ -694,6 +694,13 @@ steps: upstream_bp: upstream_dist out: [result_file, log_file] + samtools_sort_index_for_atdp: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: samtools_remove_duplicates/deduplicated_bam_bai_pair + threads: threads + out: [bam_bai_pair] + average_tag_density: label: "Read enrichment around genes TSS" doc: | @@ -701,7 +708,7 @@ steps: elements are close to the TSS of their targets. 
run: ../tools/atdp.cwl in: - input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair + input_file: samtools_sort_index_for_atdp/bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: diff --git a/workflows/trim-chipseq-pe.cwl b/workflows/trim-chipseq-pe.cwl index e10519f8..d9ad8c95 100644 --- a/workflows/trim-chipseq-pe.cwl +++ b/workflows/trim-chipseq-pe.cwl @@ -776,6 +776,13 @@ steps: upstream_bp: upstream_dist out: [result_file, log_file] + samtools_sort_index_for_atdp: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: samtools_remove_duplicates/deduplicated_bam_bai_pair + threads: threads + out: [bam_bai_pair] + average_tag_density: label: "Read enrichment around genes TSS" doc: | @@ -783,7 +790,7 @@ steps: elements are close to the TSS of their targets. run: ../tools/atdp.cwl in: - input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair + input_file: samtools_sort_index_for_atdp/bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: diff --git a/workflows/trim-chipseq-se.cwl b/workflows/trim-chipseq-se.cwl index 687ac248..a4a65727 100644 --- a/workflows/trim-chipseq-se.cwl +++ b/workflows/trim-chipseq-se.cwl @@ -669,6 +669,13 @@ steps: upstream_bp: upstream_dist out: [result_file, log_file] + samtools_sort_index_for_atdp: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: samtools_remove_duplicates/deduplicated_bam_bai_pair + threads: threads + out: [bam_bai_pair] + average_tag_density: label: "Read enrichment around genes TSS" doc: | @@ -676,7 +683,7 @@ steps: elements are close to the TSS of their targets. 
run: ../tools/atdp.cwl in: - input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair + input_file: samtools_sort_index_for_atdp/bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: From fe2af530b9bbf8a9484682e6a41044e980092ad2 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 18 Sep 2023 12:04:12 -0400 Subject: [PATCH 076/162] Add BAM index with old samtools to make atdp not fail --- workflows/chipseq-pe.cwl | 9 ++++++++- workflows/chipseq-se.cwl | 9 ++++++++- workflows/trim-atacseq-pe.cwl | 9 ++++++++- workflows/trim-atacseq-se.cwl | 9 ++++++++- workflows/trim-chipseq-pe.cwl | 9 ++++++++- workflows/trim-chipseq-se.cwl | 9 ++++++++- 6 files changed, 48 insertions(+), 6 deletions(-) diff --git a/workflows/chipseq-pe.cwl b/workflows/chipseq-pe.cwl index 1c5c2e65..b51275c4 100644 --- a/workflows/chipseq-pe.cwl +++ b/workflows/chipseq-pe.cwl @@ -697,6 +697,13 @@ steps: upstream_bp: upstream_dist out: [result_file, log_file] + samtools_sort_index_for_atdp: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: samtools_remove_duplicates/deduplicated_bam_bai_pair + threads: threads + out: [bam_bai_pair] + average_tag_density: label: "Read enrichment around genes TSS" doc: | @@ -704,7 +711,7 @@ steps: elements are close to the TSS of their targets. 
run: ../tools/atdp.cwl in: - input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair + input_file: samtools_sort_index_for_atdp/bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: diff --git a/workflows/chipseq-se.cwl b/workflows/chipseq-se.cwl index 466f2aaf..1bdcf8b1 100644 --- a/workflows/chipseq-se.cwl +++ b/workflows/chipseq-se.cwl @@ -563,6 +563,13 @@ steps: upstream_bp: upstream_dist out: [result_file] + samtools_sort_index_for_atdp: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: samtools_remove_duplicates/deduplicated_bam_bai_pair + threads: threads + out: [bam_bai_pair] + average_tag_density: label: "Read enrichment around genes TSS" doc: | @@ -570,7 +577,7 @@ steps: elements are close to the TSS of their targets. run: ../tools/atdp.cwl in: - input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair + input_file: samtools_sort_index_for_atdp/bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: diff --git a/workflows/trim-atacseq-pe.cwl b/workflows/trim-atacseq-pe.cwl index ee59db3d..1736809b 100644 --- a/workflows/trim-atacseq-pe.cwl +++ b/workflows/trim-atacseq-pe.cwl @@ -801,6 +801,13 @@ steps: upstream_bp: upstream_dist out: [result_file, log_file] + samtools_sort_index_for_atdp: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: samtools_remove_duplicates/deduplicated_bam_bai_pair + threads: threads + out: [bam_bai_pair] + average_tag_density: label: "Read enrichment around genes TSS" doc: | @@ -808,7 +815,7 @@ steps: elements are close to the TSS of their targets. 
run: ../tools/atdp.cwl in: - input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair + input_file: samtools_sort_index_for_atdp/bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: diff --git a/workflows/trim-atacseq-se.cwl b/workflows/trim-atacseq-se.cwl index ca85f219..9f90b747 100644 --- a/workflows/trim-atacseq-se.cwl +++ b/workflows/trim-atacseq-se.cwl @@ -694,6 +694,13 @@ steps: upstream_bp: upstream_dist out: [result_file, log_file] + samtools_sort_index_for_atdp: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: samtools_remove_duplicates/deduplicated_bam_bai_pair + threads: threads + out: [bam_bai_pair] + average_tag_density: label: "Read enrichment around genes TSS" doc: | @@ -701,7 +708,7 @@ steps: elements are close to the TSS of their targets. run: ../tools/atdp.cwl in: - input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair + input_file: samtools_sort_index_for_atdp/bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: diff --git a/workflows/trim-chipseq-pe.cwl b/workflows/trim-chipseq-pe.cwl index e10519f8..d9ad8c95 100644 --- a/workflows/trim-chipseq-pe.cwl +++ b/workflows/trim-chipseq-pe.cwl @@ -776,6 +776,13 @@ steps: upstream_bp: upstream_dist out: [result_file, log_file] + samtools_sort_index_for_atdp: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: samtools_remove_duplicates/deduplicated_bam_bai_pair + threads: threads + out: [bam_bai_pair] + average_tag_density: label: "Read enrichment around genes TSS" doc: | @@ -783,7 +790,7 @@ steps: elements are close to the TSS of their targets. 
run: ../tools/atdp.cwl in: - input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair + input_file: samtools_sort_index_for_atdp/bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: diff --git a/workflows/trim-chipseq-se.cwl b/workflows/trim-chipseq-se.cwl index 687ac248..a4a65727 100644 --- a/workflows/trim-chipseq-se.cwl +++ b/workflows/trim-chipseq-se.cwl @@ -669,6 +669,13 @@ steps: upstream_bp: upstream_dist out: [result_file, log_file] + samtools_sort_index_for_atdp: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: samtools_remove_duplicates/deduplicated_bam_bai_pair + threads: threads + out: [bam_bai_pair] + average_tag_density: label: "Read enrichment around genes TSS" doc: | @@ -676,7 +683,7 @@ steps: elements are close to the TSS of their targets. run: ../tools/atdp.cwl in: - input_file: samtools_remove_duplicates/deduplicated_bam_bai_pair + input_file: samtools_sort_index_for_atdp/bam_bai_pair annotation_filename: annotation_file fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated avd_window_bp: From b7242c7bddf129710791207aa5d589545e75e9c3 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 18 Sep 2023 13:01:17 -0400 Subject: [PATCH 077/162] Put back changes that have been mistakenly overwritten --- workflows/chipseq-pe.cwl | 39 ++++++++++++++++++++----------- workflows/chipseq-se.cwl | 43 +++++++++++++++++++++++------------ workflows/trim-atacseq-pe.cwl | 29 +++++++++++++++-------- workflows/trim-atacseq-se.cwl | 29 +++++++++++++++-------- workflows/trim-chipseq-pe.cwl | 29 +++++++++++++++-------- workflows/trim-chipseq-se.cwl | 29 +++++++++++++++-------- 6 files changed, 134 insertions(+), 64 deletions(-) diff --git a/workflows/chipseq-pe.cwl b/workflows/chipseq-pe.cwl index b51275c4..2bff0644 100644 --- a/workflows/chipseq-pe.cwl +++ b/workflows/chipseq-pe.cwl @@ -317,7 +317,8 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Narrow peaks" -
height: 120 + displayMode: "COLLAPSE" + height: 40 macs2_broad_peaks: type: File? @@ -331,7 +332,8 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Broad peaks" - height: 120 + displayMode: "COLLAPSE" + height: 40 macs2_peak_summits: type: File? @@ -439,22 +441,21 @@ outputs: doc: "fragment, calculated fragment, islands count from MACS2 results" outputSource: macs2_callpeak/macs2_stat_file - preseq_estimates: + preseq_estimates_plot_data: type: File? label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq/estimates_file + outputSource: preseq_plot_data/estimates_file_plot_data 'sd:visualPlugins': - - scatter: + - line: tab: 'QC Plots' - Title: 'Preseq Estimates' - xAxisTitle: 'Total reads count' - yAxisTitle: 'Expected distinct reads count' - colors: ["#4b78a3"] + Title: 'Distinct Read Counts Estimates' + xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' + yAxisTitle: 'Distinct Reads Count' + colors: ["#4b78a3", "#a3514b"] height: 500 - data: [$1, $2] - comparable: "preseq" + data: [$2, $5] estimated_fragment_size: type: int @@ -684,6 +685,18 @@ steps: default: True out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] + preseq_plot_data: + label: "Formats sequencing depth estimation data for plotting" + doc: | + Formats estimates file from preseq standard output for QC plotting. This adds a new + column that includes the actual read count point on the plot. 
+ run: ../tools/preseq-plot-data.cwl + in: + preseq_stderr_log_file: preseq/log_file_stderr + estimates_file: preseq/estimates_file + mapped_reads: get_stat/mapped_reads + out: [estimates_file_plot_data] + island_intersect: label: "Peak annotation" doc: | @@ -736,8 +749,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "ChIP-Seq pipeline paired-end" -s:name: "ChIP-Seq pipeline paired-end" +label: "Deprecated. ChIP-Seq pipeline paired-end" +s:name: "Deprecated. ChIP-Seq pipeline paired-end" s:alternateName: "ChIP-Seq basic analysis workflow for a paired-end experiment" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/chipseq-pe.cwl diff --git a/workflows/chipseq-se.cwl b/workflows/chipseq-se.cwl index 1bdcf8b1..a16e02cd 100644 --- a/workflows/chipseq-se.cwl +++ b/workflows/chipseq-se.cwl @@ -275,7 +275,8 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Narrow peaks" - height: 120 + displayMode: "COLLAPSE" + height: 40 macs2_broad_peaks: type: File? @@ -289,7 +290,8 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Broad peaks" - height: 120 + displayMode: "COLLAPSE" + height: 40 workflow_statistics_yaml: type: File? @@ -338,22 +340,21 @@ outputs: doc: "BAM statistics report (after all filters applied)" outputSource: get_bam_statistics_after_filtering/log_file - preseq_estimates: + preseq_estimates_plot_data: type: File? 
- label: "Expected Distinct Reads Count Plot" + label: "Preseq estimates" format: "http://edamontology.org/format_3475" - doc: "Expected distinct reads count file from Preseq in TSV format" - outputSource: preseq/estimates_file + doc: "Preseq estimated results" + outputSource: preseq_plot_data/estimates_file_plot_data 'sd:visualPlugins': - - scatter: + - line: tab: 'QC Plots' - Title: 'Expected Distinct Reads Count Plot' - xAxisTitle: 'Total reads count' - yAxisTitle: 'Expected distinct reads count' - colors: ["#4b78a3"] + Title: 'Distinct Read Counts Estimates' + xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' + yAxisTitle: 'Distinct Reads Count' + colors: ["#4b78a3", "#a3514b"] height: 500 - data: [$1, $2] - comparable: "preseq" + data: [$2, $5] estimated_fragment_size: type: int @@ -550,6 +551,18 @@ steps: preseq_results: preseq/estimates_file out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] + preseq_plot_data: + label: "Formats sequencing depth estimation data for plotting" + doc: | + Formats estimates file from preseq standard output for QC plotting. This adds a new + column that includes the actual read count point on the plot. + run: ../tools/preseq-plot-data.cwl + in: + preseq_stderr_log_file: preseq/log_file_stderr + estimates_file: preseq/estimates_file + mapped_reads: get_stat/mapped_reads + out: [estimates_file_plot_data] + island_intersect: label: "Peak annotation" doc: | @@ -602,8 +615,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "ChIP-Seq pipeline single-read" -s:name: "ChIP-Seq pipeline single-read" +label: "Deprecated. ChIP-Seq pipeline single-read" +s:name: "Deprecated. 
ChIP-Seq pipeline single-read" s:alternateName: "ChIP-Seq basic analysis workflow for single-read data" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/chipseq-se.cwl diff --git a/workflows/trim-atacseq-pe.cwl b/workflows/trim-atacseq-pe.cwl index 1736809b..c7452989 100644 --- a/workflows/trim-atacseq-pe.cwl +++ b/workflows/trim-atacseq-pe.cwl @@ -470,22 +470,21 @@ outputs: doc: "TrimGalore generated log for FASTQ 2" outputSource: trim_fastq/report_file_pair - preseq_estimates: + preseq_estimates_plot_data: type: File? label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq/estimates_file + outputSource: preseq_plot_data/estimates_file_plot_data 'sd:visualPlugins': - - scatter: + - line: tab: 'QC Plots' - Title: 'Preseq Estimates' - xAxisTitle: 'Total reads count' - yAxisTitle: 'Expected distinct reads count' - colors: ["#4b78a3"] + Title: 'Distinct Read Counts Estimates' + xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' + yAxisTitle: 'Distinct Reads Count' + colors: ["#4b78a3", "#a3514b"] height: 500 - data: [$1, $2] - comparable: "preseq" + data: [$2, $5] estimated_fragment_size: type: int @@ -788,6 +787,18 @@ steps: default: True out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] + preseq_plot_data: + label: "Formats sequencing depth estimation data for plotting" + doc: | + Formats estimates file from preseq standard output for QC plotting. This adds a new + column that includes the actual read count point on the plot. 
+ run: ../tools/preseq-plot-data.cwl + in: + preseq_stderr_log_file: preseq/log_file_stderr + estimates_file: preseq/estimates_file + mapped_reads: get_stat/mapped_reads + out: [estimates_file_plot_data] + island_intersect: label: "Peak annotation" doc: | diff --git a/workflows/trim-atacseq-se.cwl b/workflows/trim-atacseq-se.cwl index 9f90b747..3c4cc27f 100644 --- a/workflows/trim-atacseq-se.cwl +++ b/workflows/trim-atacseq-se.cwl @@ -415,22 +415,21 @@ outputs: doc: "TrimGalore generated log" outputSource: trim_fastq/report_file - preseq_estimates: + preseq_estimates_plot_data: type: File? label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq/estimates_file + outputSource: preseq_plot_data/estimates_file_plot_data 'sd:visualPlugins': - - scatter: + - line: tab: 'QC Plots' - Title: 'Preseq Estimates' - xAxisTitle: 'Total reads count' - yAxisTitle: 'Expected distinct reads count' - colors: ["#4b78a3"] + Title: 'Distinct Read Counts Estimates' + xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' + yAxisTitle: 'Distinct Reads Count' + colors: ["#4b78a3", "#a3514b"] height: 500 - data: [$1, $2] - comparable: "preseq" + data: [$2, $5] estimated_fragment_size: type: int @@ -681,6 +680,18 @@ steps: preseq_results: preseq/estimates_file out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] + preseq_plot_data: + label: "Formats sequencing depth estimation data for plotting" + doc: | + Formats estimates file from preseq standard output for QC plotting. This adds a new + column that includes the actual read count point on the plot. 
+ run: ../tools/preseq-plot-data.cwl + in: + preseq_stderr_log_file: preseq/log_file_stderr + estimates_file: preseq/estimates_file + mapped_reads: get_stat/mapped_reads + out: [estimates_file_plot_data] + island_intersect: label: "Peak annotation" doc: | diff --git a/workflows/trim-chipseq-pe.cwl b/workflows/trim-chipseq-pe.cwl index d9ad8c95..cfcdd512 100644 --- a/workflows/trim-chipseq-pe.cwl +++ b/workflows/trim-chipseq-pe.cwl @@ -454,22 +454,21 @@ outputs: doc: "TrimGalore generated log for FASTQ 2" outputSource: trim_fastq/report_file_pair - preseq_estimates: + preseq_estimates_plot_data: type: File? label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq/estimates_file + outputSource: preseq_plot_data/estimates_file_plot_data 'sd:visualPlugins': - - scatter: + - line: tab: 'QC Plots' - Title: 'Preseq Estimates' - xAxisTitle: 'Total reads count' - yAxisTitle: 'Expected distinct reads count' - colors: ["#4b78a3"] + Title: 'Distinct Read Counts Estimates' + xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' + yAxisTitle: 'Distinct Reads Count' + colors: ["#4b78a3", "#a3514b"] height: 500 - data: [$1, $2] - comparable: "preseq" + data: [$2, $5] estimated_fragment_size: type: int @@ -763,6 +762,18 @@ steps: default: True out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] + preseq_plot_data: + label: "Formats sequencing depth estimation data for plotting" + doc: | + Formats estimates file from preseq standard output for QC plotting. This adds a new + column that includes the actual read count point on the plot. 
+ run: ../tools/preseq-plot-data.cwl + in: + preseq_stderr_log_file: preseq/log_file_stderr + estimates_file: preseq/estimates_file + mapped_reads: get_stat/mapped_reads + out: [estimates_file_plot_data] + island_intersect: label: "Peak annotation" doc: | diff --git a/workflows/trim-chipseq-se.cwl b/workflows/trim-chipseq-se.cwl index a4a65727..4a9a372d 100644 --- a/workflows/trim-chipseq-se.cwl +++ b/workflows/trim-chipseq-se.cwl @@ -400,22 +400,21 @@ outputs: doc: "TrimGalore generated log" outputSource: trim_fastq/report_file - preseq_estimates: + preseq_estimates_plot_data: type: File? label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq/estimates_file + outputSource: preseq_plot_data/estimates_file_plot_data 'sd:visualPlugins': - - scatter: + - line: tab: 'QC Plots' - Title: 'Preseq Estimates' - xAxisTitle: 'Total reads count' - yAxisTitle: 'Expected distinct reads count' - colors: ["#4b78a3"] + Title: 'Distinct Read Counts Estimates' + xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' + yAxisTitle: 'Distinct Reads Count' + colors: ["#4b78a3", "#a3514b"] height: 500 - data: [$1, $2] - comparable: "preseq" + data: [$2, $5] estimated_fragment_size: type: int @@ -656,6 +655,18 @@ steps: preseq_results: preseq/estimates_file out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] + preseq_plot_data: + label: "Formats sequencing depth estimation data for plotting" + doc: | + Formats estimates file from preseq standard output for QC plotting. This adds a new + column that includes the actual read count point on the plot. 
+ run: ../tools/preseq-plot-data.cwl + in: + preseq_stderr_log_file: preseq/log_file_stderr + estimates_file: preseq/estimates_file + mapped_reads: get_stat/mapped_reads + out: [estimates_file_plot_data] + island_intersect: label: "Peak annotation" doc: | From af61e4858bceb0601c0bafd772a6806fa5ae10c2 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 18 Sep 2023 13:03:44 -0400 Subject: [PATCH 078/162] Not important changes --- workflows/chipseq-pe.cwl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/chipseq-pe.cwl b/workflows/chipseq-pe.cwl index 2bff0644..e7618770 100644 --- a/workflows/chipseq-pe.cwl +++ b/workflows/chipseq-pe.cwl @@ -360,8 +360,8 @@ outputs: tab: 'IGV Genome Browser' id: 'igvbrowser' type: 'annotation' - name: "Gapped peaks" - height: 120 + displayMode: "COLLAPSE" + height: 40 macs2_log: type: File? From 8447aac6d2c72d2e21232ebe77642512104c415d Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 18 Sep 2023 13:04:54 -0400 Subject: [PATCH 079/162] Not important changes --- workflows/chipseq-pe.cwl | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/chipseq-pe.cwl b/workflows/chipseq-pe.cwl index e7618770..96483bd0 100644 --- a/workflows/chipseq-pe.cwl +++ b/workflows/chipseq-pe.cwl @@ -360,6 +360,7 @@ outputs: tab: 'IGV Genome Browser' id: 'igvbrowser' type: 'annotation' + name: "Gapped peaks" displayMode: "COLLAPSE" height: 40 From 8878f2edf39ad706f89c00b347c3ae30c1c13f9e Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 18 Sep 2023 13:12:49 -0400 Subject: [PATCH 080/162] Put back proper preseq --- workflows/chipseq-pe.cwl | 8 +++++++- workflows/chipseq-se.cwl | 8 +++++++- workflows/trim-atacseq-pe.cwl | 8 +++++++- workflows/trim-atacseq-se.cwl | 8 +++++++- workflows/trim-chipseq-pe.cwl | 8 +++++++- workflows/trim-chipseq-se.cwl | 8 +++++++- 6 files changed, 42 insertions(+), 6 deletions(-) diff --git a/workflows/chipseq-pe.cwl b/workflows/chipseq-pe.cwl index 96483bd0..d4228150 100644
--- a/workflows/chipseq-pe.cwl +++ b/workflows/chipseq-pe.cwl @@ -573,6 +573,12 @@ steps: threads: threads out: [deduplicated_bam_bai_pair] + clean_sam_headers_for_preseq: + run: ../tools/samtools-clean-headers.cwl + in: + bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair + out: [preseq_bam] + preseq: label: "Sequencing depth estimation" doc: | @@ -580,7 +586,7 @@ steps: be expected from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair + bam_file: clean_sam_headers_for_preseq/preseq_bam pe_mode: default: true extrapolation: diff --git a/workflows/chipseq-se.cwl b/workflows/chipseq-se.cwl index a16e02cd..73ff4b9c 100644 --- a/workflows/chipseq-se.cwl +++ b/workflows/chipseq-se.cwl @@ -450,6 +450,12 @@ steps: threads: threads out: [deduplicated_bam_bai_pair] + clean_sam_headers_for_preseq: + run: ../tools/samtools-clean-headers.cwl + in: + bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair + out: [preseq_bam] + preseq: label: "Sequencing depth estimation" doc: | @@ -457,7 +463,7 @@ steps: be expected from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair + bam_file: clean_sam_headers_for_preseq/preseq_bam extrapolation: default: 1000000000 out: [estimates_file] diff --git a/workflows/trim-atacseq-pe.cwl b/workflows/trim-atacseq-pe.cwl index c7452989..947d128f 100644 --- a/workflows/trim-atacseq-pe.cwl +++ b/workflows/trim-atacseq-pe.cwl @@ -670,6 +670,12 @@ steps: exclude_chromosome: exclude_chromosome out: [filtered_bam_bai_pair] + clean_sam_headers_for_preseq: + run: ../tools/samtools-clean-headers.cwl + in: + bam_file: filter_bam/filtered_bam_bai_pair + out: [preseq_bam] + preseq: label: "Sequencing depth estimation" doc: | @@ -677,7 +683,7 @@ steps: be expected from the additional sequencing of the same experiment. 
run: ../tools/preseq-lc-extrap.cwl in: - bam_file: filter_bam/filtered_bam_bai_pair + bam_file: clean_sam_headers_for_preseq/preseq_bam pe_mode: default: true extrapolation: diff --git a/workflows/trim-atacseq-se.cwl b/workflows/trim-atacseq-se.cwl index 3c4cc27f..4aeff2ec 100644 --- a/workflows/trim-atacseq-se.cwl +++ b/workflows/trim-atacseq-se.cwl @@ -568,6 +568,12 @@ steps: exclude_chromosome: exclude_chromosome out: [filtered_bam_bai_pair] + clean_sam_headers_for_preseq: + run: ../tools/samtools-clean-headers.cwl + in: + bam_file: filter_bam/filtered_bam_bai_pair + out: [preseq_bam] + preseq: label: "Sequencing depth estimation" doc: | @@ -575,7 +581,7 @@ steps: be expected from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: filter_bam/filtered_bam_bai_pair + bam_file: clean_sam_headers_for_preseq/preseq_bam extrapolation: default: 1000000000 out: [estimates_file] diff --git a/workflows/trim-chipseq-pe.cwl b/workflows/trim-chipseq-pe.cwl index cfcdd512..c6f8c59b 100644 --- a/workflows/trim-chipseq-pe.cwl +++ b/workflows/trim-chipseq-pe.cwl @@ -647,6 +647,12 @@ steps: threads: threads out: [deduplicated_bam_bai_pair] + clean_sam_headers_for_preseq: + run: ../tools/samtools-clean-headers.cwl + in: + bam_file: filter_bam/filtered_bam_bai_pair + out: [preseq_bam] + preseq: label: "Sequencing depth estimation" doc: | @@ -654,7 +660,7 @@ steps: be expected from the additional sequencing of the same experiment. 
run: ../tools/preseq-lc-extrap.cwl in: - bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair + bam_file: clean_sam_headers_for_preseq/preseq_bam pe_mode: default: true extrapolation: diff --git a/workflows/trim-chipseq-se.cwl b/workflows/trim-chipseq-se.cwl index 4a9a372d..bcc8f22c 100644 --- a/workflows/trim-chipseq-se.cwl +++ b/workflows/trim-chipseq-se.cwl @@ -546,6 +546,12 @@ steps: threads: threads out: [deduplicated_bam_bai_pair] + clean_sam_headers_for_preseq: + run: ../tools/samtools-clean-headers.cwl + in: + bam_file: filter_bam/filtered_bam_bai_pair + out: [preseq_bam] + preseq: label: "Sequencing depth estimation" doc: | @@ -553,7 +559,7 @@ steps: be expected from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair + bam_file: clean_sam_headers_for_preseq/preseq_bam extrapolation: default: 1000000000 out: [estimates_file] From aa1c97b2b89b7a61ea29a06e094d92553b091358 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 18 Sep 2023 13:18:28 -0400 Subject: [PATCH 081/162] Not important changes --- workflows/chipseq-se.cwl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/workflows/chipseq-se.cwl b/workflows/chipseq-se.cwl index 73ff4b9c..7bbbab01 100644 --- a/workflows/chipseq-se.cwl +++ b/workflows/chipseq-se.cwl @@ -298,14 +298,14 @@ outputs: label: "YAML formatted combined log" format: "http://edamontology.org/format_3750" doc: "YAML formatted combined log" - outputSource: get_statistics/collected_statistics_yaml + outputSource: get_stat/collected_statistics_yaml workflow_statistics_markdown: type: File?
label: "Markdown formatted combined log" format: "http://edamontology.org/format_3835" doc: "Markdown formatted combined log" - outputSource: get_statistics/collected_statistics_md + outputSource: get_stat/collected_statistics_md 'sd:visualPlugins': - markdownView: tab: 'Overview' @@ -315,7 +315,7 @@ outputs: label: "Workflow execution statistics" format: "http://edamontology.org/format_3475" doc: "Overall workflow execution statistics from bowtie_aligner and samtools_rmdup steps" - outputSource: get_statistics/collected_statistics_tsv + outputSource: get_stat/collected_statistics_tsv 'sd:visualPlugins': - tableView: vertical: true @@ -366,7 +366,7 @@ outputs: type: int label: "Mapped reads number" doc: "Mapped reads number for downstream analyses" - outputSource: get_statistics/mapped_reads + outputSource: get_stat/mapped_reads steps: @@ -520,7 +520,7 @@ steps: in: bam_file: samtools_remove_duplicates/deduplicated_bam_bai_pair chrom_length_file: chrom_length - mapped_reads_number: get_statistics/mapped_reads + mapped_reads_number: get_stat/mapped_reads fragment_size: macs2_callpeak/macs2_fragments_calculated out: [bigwig_file] @@ -546,7 +546,7 @@ steps: valueFrom: $(get_root(self.basename)+"_bam_statistics_report_after_filtering.txt") out: [log_file, reads_mapped] - get_statistics: + get_stat: run: ../tools/collect-statistics-chip-seq.cwl in: bowtie_alignment_report: bowtie_aligner/log_file From 6c0703a6b51439fd0b781961ee7afd69ea9c1d2d Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 18 Sep 2023 13:29:37 -0400 Subject: [PATCH 082/162] Put back preseq logs outputs --- workflows/chipseq-pe.cwl | 2 +- workflows/chipseq-se.cwl | 2 +- workflows/trim-atacseq-pe.cwl | 2 +- workflows/trim-atacseq-se.cwl | 2 +- workflows/trim-chipseq-pe.cwl | 2 +- workflows/trim-chipseq-se.cwl | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/workflows/chipseq-pe.cwl b/workflows/chipseq-pe.cwl index d4228150..2aa5568f 100644 --- a/workflows/chipseq-pe.cwl +++ 
b/workflows/chipseq-pe.cwl @@ -591,7 +591,7 @@ steps: default: true extrapolation: default: 1000000000 - out: [estimates_file] + out: [estimates_file, log_file_stdout, log_file_stderr] samtools_remove_duplicates: run: ../tools/samtools-markdup.cwl diff --git a/workflows/chipseq-se.cwl b/workflows/chipseq-se.cwl index 7bbbab01..d6f83eb5 100644 --- a/workflows/chipseq-se.cwl +++ b/workflows/chipseq-se.cwl @@ -466,7 +466,7 @@ steps: bam_file: clean_sam_headers_for_preseq/preseq_bam extrapolation: default: 1000000000 - out: [estimates_file] + out: [estimates_file, log_file_stdout, log_file_stderr] samtools_remove_duplicates: run: ../tools/samtools-markdup.cwl diff --git a/workflows/trim-atacseq-pe.cwl b/workflows/trim-atacseq-pe.cwl index 947d128f..fdc13c52 100644 --- a/workflows/trim-atacseq-pe.cwl +++ b/workflows/trim-atacseq-pe.cwl @@ -688,7 +688,7 @@ steps: default: true extrapolation: default: 1000000000 - out: [estimates_file] + out: [estimates_file, log_file_stdout, log_file_stderr] samtools_remove_duplicates: run: ../tools/samtools-markdup.cwl diff --git a/workflows/trim-atacseq-se.cwl b/workflows/trim-atacseq-se.cwl index 4aeff2ec..7c704049 100644 --- a/workflows/trim-atacseq-se.cwl +++ b/workflows/trim-atacseq-se.cwl @@ -584,7 +584,7 @@ steps: bam_file: clean_sam_headers_for_preseq/preseq_bam extrapolation: default: 1000000000 - out: [estimates_file] + out: [estimates_file, log_file_stdout, log_file_stderr] samtools_remove_duplicates: run: ../tools/samtools-markdup.cwl diff --git a/workflows/trim-chipseq-pe.cwl b/workflows/trim-chipseq-pe.cwl index c6f8c59b..e9c095de 100644 --- a/workflows/trim-chipseq-pe.cwl +++ b/workflows/trim-chipseq-pe.cwl @@ -665,7 +665,7 @@ steps: default: true extrapolation: default: 1000000000 - out: [estimates_file] + out: [estimates_file, log_file_stdout, log_file_stderr] samtools_remove_duplicates: run: ../tools/samtools-markdup.cwl diff --git a/workflows/trim-chipseq-se.cwl b/workflows/trim-chipseq-se.cwl index bcc8f22c..6f758fb9 
100644 --- a/workflows/trim-chipseq-se.cwl +++ b/workflows/trim-chipseq-se.cwl @@ -562,7 +562,7 @@ steps: bam_file: clean_sam_headers_for_preseq/preseq_bam extrapolation: default: 1000000000 - out: [estimates_file] + out: [estimates_file, log_file_stdout, log_file_stderr] samtools_remove_duplicates: run: ../tools/samtools-markdup.cwl From de59f003fcd76392ae8f8f56995f25e094a5d186 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 18 Sep 2023 13:36:01 -0400 Subject: [PATCH 083/162] Fix bug in connecting preseq step --- workflows/trim-chipseq-pe.cwl | 2 +- workflows/trim-chipseq-se.cwl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/trim-chipseq-pe.cwl b/workflows/trim-chipseq-pe.cwl index e9c095de..b958e75f 100644 --- a/workflows/trim-chipseq-pe.cwl +++ b/workflows/trim-chipseq-pe.cwl @@ -650,7 +650,7 @@ steps: clean_sam_headers_for_preseq: run: ../tools/samtools-clean-headers.cwl in: - bam_file: filter_bam/filtered_bam_bai_pair + bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair out: [preseq_bam] preseq: diff --git a/workflows/trim-chipseq-se.cwl b/workflows/trim-chipseq-se.cwl index 6f758fb9..296e27d0 100644 --- a/workflows/trim-chipseq-se.cwl +++ b/workflows/trim-chipseq-se.cwl @@ -549,7 +549,7 @@ steps: clean_sam_headers_for_preseq: run: ../tools/samtools-clean-headers.cwl in: - bam_file: filter_bam/filtered_bam_bai_pair + bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair out: [preseq_bam] preseq: From d8522afbbdfc2c0b85e96ddafbd31d517556e46a Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 18 Sep 2023 14:20:16 -0400 Subject: [PATCH 084/162] COmbine global changes with preseq and local changes with PCR duplicates --- workflows/chipseq-pe.cwl | 52 +++++++++++++++++++--------- workflows/chipseq-se.cwl | 65 ++++++++++++++++++++++------------- workflows/trim-atacseq-pe.cwl | 39 +++++++++++++++------ workflows/trim-atacseq-se.cwl | 39 +++++++++++++++------ workflows/trim-chipseq-pe.cwl | 39 
+++++++++++++++------ workflows/trim-chipseq-se.cwl | 39 +++++++++++++++------ 6 files changed, 190 insertions(+), 83 deletions(-) diff --git a/workflows/chipseq-pe.cwl b/workflows/chipseq-pe.cwl index b51275c4..2aa5568f 100644 --- a/workflows/chipseq-pe.cwl +++ b/workflows/chipseq-pe.cwl @@ -317,7 +317,8 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Narrow peaks" - height: 120 + displayMode: "COLLAPSE" + height: 40 macs2_broad_peaks: type: File? @@ -331,7 +332,8 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Broad peaks" - height: 120 + displayMode: "COLLAPSE" + height: 40 macs2_peak_summits: type: File? @@ -359,7 +361,8 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Gapped peaks" - height: 120 + displayMode: "COLLAPSE" + height: 40 macs2_log: type: File? @@ -439,22 +442,21 @@ outputs: doc: "fragment, calculated fragment, islands count from MACS2 results" outputSource: macs2_callpeak/macs2_stat_file - preseq_estimates: + preseq_estimates_plot_data: type: File? label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq/estimates_file + outputSource: preseq_plot_data/estimates_file_plot_data 'sd:visualPlugins': - - scatter: + - line: tab: 'QC Plots' - Title: 'Preseq Estimates' - xAxisTitle: 'Total reads count' - yAxisTitle: 'Expected distinct reads count' - colors: ["#4b78a3"] + Title: 'Distinct Read Counts Estimates' + xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' + yAxisTitle: 'Distinct Reads Count' + colors: ["#4b78a3", "#a3514b"] height: 500 - data: [$1, $2] - comparable: "preseq" + data: [$2, $5] estimated_fragment_size: type: int @@ -571,6 +573,12 @@ steps: threads: threads out: [deduplicated_bam_bai_pair] + clean_sam_headers_for_preseq: + run: ../tools/samtools-clean-headers.cwl + in: + bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair + out: [preseq_bam] + preseq: label: "Sequencing depth estimation" doc: | @@ -578,12 +586,12 @@ steps: be expected 
from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair + bam_file: clean_sam_headers_for_preseq/preseq_bam pe_mode: default: true extrapolation: default: 1000000000 - out: [estimates_file] + out: [estimates_file, log_file_stdout, log_file_stderr] samtools_remove_duplicates: run: ../tools/samtools-markdup.cwl @@ -684,6 +692,18 @@ steps: default: True out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] + preseq_plot_data: + label: "Formats sequencing depth estimation data for plotting" + doc: | + Formats estimates file from preseq standard output for QC plotting. This adds a new + column that includes the actual read count point on the plot. + run: ../tools/preseq-plot-data.cwl + in: + preseq_stderr_log_file: preseq/log_file_stderr + estimates_file: preseq/estimates_file + mapped_reads: get_stat/mapped_reads + out: [estimates_file_plot_data] + island_intersect: label: "Peak annotation" doc: | @@ -736,8 +756,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "ChIP-Seq pipeline paired-end" -s:name: "ChIP-Seq pipeline paired-end" +label: "Deprecated. ChIP-Seq pipeline paired-end" +s:name: "Deprecated. ChIP-Seq pipeline paired-end" s:alternateName: "ChIP-Seq basic analysis workflow for a paired-end experiment" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/chipseq-pe.cwl diff --git a/workflows/chipseq-se.cwl b/workflows/chipseq-se.cwl index 1bdcf8b1..d6f83eb5 100644 --- a/workflows/chipseq-se.cwl +++ b/workflows/chipseq-se.cwl @@ -275,7 +275,8 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Narrow peaks" - height: 120 + displayMode: "COLLAPSE" + height: 40 macs2_broad_peaks: type: File? 
@@ -289,21 +290,22 @@ outputs: id: 'igvbrowser' type: 'annotation' name: "Broad peaks" - height: 120 + displayMode: "COLLAPSE" + height: 40 workflow_statistics_yaml: type: File? label: "YAML formatted combined log" format: "http://edamontology.org/format_3750" doc: "YAML formatted combined log" - outputSource: get_statistics/collected_statistics_yaml + outputSource: get_stat/collected_statistics_yaml workflow_statistics_markdown: type: File? label: "Markdown formatted combined log" format: "http://edamontology.org/format_3835" doc: "Markdown formatted combined log" - outputSource: get_statistics/collected_statistics_md + outputSource: get_stat/collected_statistics_md 'sd:visualPlugins': - markdownView: tab: 'Overview' @@ -313,7 +315,7 @@ outputs: label: "Workflow execution statistics" format: "http://edamontology.org/format_3475" doc: "Overall workflow execution statistics from bowtie_aligner and samtools_rmdup steps" - outputSource: get_statistics/collected_statistics_tsv + outputSource: get_stat/collected_statistics_tsv 'sd:visualPlugins': - tableView: vertical: true @@ -338,22 +340,21 @@ outputs: doc: "BAM statistics report (after all filters applied)" outputSource: get_bam_statistics_after_filtering/log_file - preseq_estimates: + preseq_estimates_plot_data: type: File? 
- label: "Expected Distinct Reads Count Plot" + label: "Preseq estimates" format: "http://edamontology.org/format_3475" - doc: "Expected distinct reads count file from Preseq in TSV format" - outputSource: preseq/estimates_file + doc: "Preseq estimated results" + outputSource: preseq_plot_data/estimates_file_plot_data 'sd:visualPlugins': - - scatter: + - line: tab: 'QC Plots' - Title: 'Expected Distinct Reads Count Plot' - xAxisTitle: 'Total reads count' - yAxisTitle: 'Expected distinct reads count' - colors: ["#4b78a3"] + Title: 'Distinct Read Counts Estimates' + xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' + yAxisTitle: 'Distinct Reads Count' + colors: ["#4b78a3", "#a3514b"] height: 500 - data: [$1, $2] - comparable: "preseq" + data: [$2, $5] estimated_fragment_size: type: int @@ -365,7 +366,7 @@ outputs: type: int label: "Mapped reads number" doc: "Mapped reads number for downstream analyses" - outputSource: get_statistics/mapped_reads + outputSource: get_stat/mapped_reads steps: @@ -449,6 +450,12 @@ steps: threads: threads out: [deduplicated_bam_bai_pair] + clean_sam_headers_for_preseq: + run: ../tools/samtools-clean-headers.cwl + in: + bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair + out: [preseq_bam] + preseq: label: "Sequencing depth estimation" doc: | @@ -456,10 +463,10 @@ steps: be expected from the additional sequencing of the same experiment. 
run: ../tools/preseq-lc-extrap.cwl in: - bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair + bam_file: clean_sam_headers_for_preseq/preseq_bam extrapolation: default: 1000000000 - out: [estimates_file] + out: [estimates_file, log_file_stdout, log_file_stderr] samtools_remove_duplicates: run: ../tools/samtools-markdup.cwl @@ -513,7 +520,7 @@ steps: in: bam_file: samtools_remove_duplicates/deduplicated_bam_bai_pair chrom_length_file: chrom_length - mapped_reads_number: get_statistics/mapped_reads + mapped_reads_number: get_stat/mapped_reads fragment_size: macs2_callpeak/macs2_fragments_calculated out: [bigwig_file] @@ -539,7 +546,7 @@ steps: valueFrom: $(get_root(self.basename)+"_bam_statistics_report_after_filtering.txt") out: [log_file, reads_mapped] - get_statistics: + get_stat: run: ../tools/collect-statistics-chip-seq.cwl in: bowtie_alignment_report: bowtie_aligner/log_file @@ -550,6 +557,18 @@ steps: preseq_results: preseq/estimates_file out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] + preseq_plot_data: + label: "Formats sequencing depth estimation data for plotting" + doc: | + Formats estimates file from preseq standard output for QC plotting. This adds a new + column that includes the actual read count point on the plot. + run: ../tools/preseq-plot-data.cwl + in: + preseq_stderr_log_file: preseq/log_file_stderr + estimates_file: preseq/estimates_file + mapped_reads: get_stat/mapped_reads + out: [estimates_file_plot_data] + island_intersect: label: "Peak annotation" doc: | @@ -602,8 +621,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "ChIP-Seq pipeline single-read" -s:name: "ChIP-Seq pipeline single-read" +label: "Deprecated. ChIP-Seq pipeline single-read" +s:name: "Deprecated. 
ChIP-Seq pipeline single-read" s:alternateName: "ChIP-Seq basic analysis workflow for single-read data" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/chipseq-se.cwl diff --git a/workflows/trim-atacseq-pe.cwl b/workflows/trim-atacseq-pe.cwl index 1736809b..fdc13c52 100644 --- a/workflows/trim-atacseq-pe.cwl +++ b/workflows/trim-atacseq-pe.cwl @@ -470,22 +470,21 @@ outputs: doc: "TrimGalore generated log for FASTQ 2" outputSource: trim_fastq/report_file_pair - preseq_estimates: + preseq_estimates_plot_data: type: File? label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq/estimates_file + outputSource: preseq_plot_data/estimates_file_plot_data 'sd:visualPlugins': - - scatter: + - line: tab: 'QC Plots' - Title: 'Preseq Estimates' - xAxisTitle: 'Total reads count' - yAxisTitle: 'Expected distinct reads count' - colors: ["#4b78a3"] + Title: 'Distinct Read Counts Estimates' + xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' + yAxisTitle: 'Distinct Reads Count' + colors: ["#4b78a3", "#a3514b"] height: 500 - data: [$1, $2] - comparable: "preseq" + data: [$2, $5] estimated_fragment_size: type: int @@ -671,6 +670,12 @@ steps: exclude_chromosome: exclude_chromosome out: [filtered_bam_bai_pair] + clean_sam_headers_for_preseq: + run: ../tools/samtools-clean-headers.cwl + in: + bam_file: filter_bam/filtered_bam_bai_pair + out: [preseq_bam] + preseq: label: "Sequencing depth estimation" doc: | @@ -678,12 +683,12 @@ steps: be expected from the additional sequencing of the same experiment. 
run: ../tools/preseq-lc-extrap.cwl in: - bam_file: filter_bam/filtered_bam_bai_pair + bam_file: clean_sam_headers_for_preseq/preseq_bam pe_mode: default: true extrapolation: default: 1000000000 - out: [estimates_file] + out: [estimates_file, log_file_stdout, log_file_stderr] samtools_remove_duplicates: run: ../tools/samtools-markdup.cwl @@ -788,6 +793,18 @@ steps: default: True out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] + preseq_plot_data: + label: "Formats sequencing depth estimation data for plotting" + doc: | + Formats estimates file from preseq standard output for QC plotting. This adds a new + column that includes the actual read count point on the plot. + run: ../tools/preseq-plot-data.cwl + in: + preseq_stderr_log_file: preseq/log_file_stderr + estimates_file: preseq/estimates_file + mapped_reads: get_stat/mapped_reads + out: [estimates_file_plot_data] + island_intersect: label: "Peak annotation" doc: | diff --git a/workflows/trim-atacseq-se.cwl b/workflows/trim-atacseq-se.cwl index 9f90b747..7c704049 100644 --- a/workflows/trim-atacseq-se.cwl +++ b/workflows/trim-atacseq-se.cwl @@ -415,22 +415,21 @@ outputs: doc: "TrimGalore generated log" outputSource: trim_fastq/report_file - preseq_estimates: + preseq_estimates_plot_data: type: File? 
label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq/estimates_file + outputSource: preseq_plot_data/estimates_file_plot_data 'sd:visualPlugins': - - scatter: + - line: tab: 'QC Plots' - Title: 'Preseq Estimates' - xAxisTitle: 'Total reads count' - yAxisTitle: 'Expected distinct reads count' - colors: ["#4b78a3"] + Title: 'Distinct Read Counts Estimates' + xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' + yAxisTitle: 'Distinct Reads Count' + colors: ["#4b78a3", "#a3514b"] height: 500 - data: [$1, $2] - comparable: "preseq" + data: [$2, $5] estimated_fragment_size: type: int @@ -569,6 +568,12 @@ steps: exclude_chromosome: exclude_chromosome out: [filtered_bam_bai_pair] + clean_sam_headers_for_preseq: + run: ../tools/samtools-clean-headers.cwl + in: + bam_file: filter_bam/filtered_bam_bai_pair + out: [preseq_bam] + preseq: label: "Sequencing depth estimation" doc: | @@ -576,10 +581,10 @@ steps: be expected from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: filter_bam/filtered_bam_bai_pair + bam_file: clean_sam_headers_for_preseq/preseq_bam extrapolation: default: 1000000000 - out: [estimates_file] + out: [estimates_file, log_file_stdout, log_file_stderr] samtools_remove_duplicates: run: ../tools/samtools-markdup.cwl @@ -681,6 +686,18 @@ steps: preseq_results: preseq/estimates_file out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] + preseq_plot_data: + label: "Formats sequencing depth estimation data for plotting" + doc: | + Formats estimates file from preseq standard output for QC plotting. This adds a new + column that includes the actual read count point on the plot. 
+ run: ../tools/preseq-plot-data.cwl + in: + preseq_stderr_log_file: preseq/log_file_stderr + estimates_file: preseq/estimates_file + mapped_reads: get_stat/mapped_reads + out: [estimates_file_plot_data] + island_intersect: label: "Peak annotation" doc: | diff --git a/workflows/trim-chipseq-pe.cwl b/workflows/trim-chipseq-pe.cwl index d9ad8c95..b958e75f 100644 --- a/workflows/trim-chipseq-pe.cwl +++ b/workflows/trim-chipseq-pe.cwl @@ -454,22 +454,21 @@ outputs: doc: "TrimGalore generated log for FASTQ 2" outputSource: trim_fastq/report_file_pair - preseq_estimates: + preseq_estimates_plot_data: type: File? label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq/estimates_file + outputSource: preseq_plot_data/estimates_file_plot_data 'sd:visualPlugins': - - scatter: + - line: tab: 'QC Plots' - Title: 'Preseq Estimates' - xAxisTitle: 'Total reads count' - yAxisTitle: 'Expected distinct reads count' - colors: ["#4b78a3"] + Title: 'Distinct Read Counts Estimates' + xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' + yAxisTitle: 'Distinct Reads Count' + colors: ["#4b78a3", "#a3514b"] height: 500 - data: [$1, $2] - comparable: "preseq" + data: [$2, $5] estimated_fragment_size: type: int @@ -648,6 +647,12 @@ steps: threads: threads out: [deduplicated_bam_bai_pair] + clean_sam_headers_for_preseq: + run: ../tools/samtools-clean-headers.cwl + in: + bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair + out: [preseq_bam] + preseq: label: "Sequencing depth estimation" doc: | @@ -655,12 +660,12 @@ steps: be expected from the additional sequencing of the same experiment. 
run: ../tools/preseq-lc-extrap.cwl in: - bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair + bam_file: clean_sam_headers_for_preseq/preseq_bam pe_mode: default: true extrapolation: default: 1000000000 - out: [estimates_file] + out: [estimates_file, log_file_stdout, log_file_stderr] samtools_remove_duplicates: run: ../tools/samtools-markdup.cwl @@ -763,6 +768,18 @@ steps: default: True out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] + preseq_plot_data: + label: "Formats sequencing depth estimation data for plotting" + doc: | + Formats estimates file from preseq standard output for QC plotting. This adds a new + column that includes the actual read count point on the plot. + run: ../tools/preseq-plot-data.cwl + in: + preseq_stderr_log_file: preseq/log_file_stderr + estimates_file: preseq/estimates_file + mapped_reads: get_stat/mapped_reads + out: [estimates_file_plot_data] + island_intersect: label: "Peak annotation" doc: | diff --git a/workflows/trim-chipseq-se.cwl b/workflows/trim-chipseq-se.cwl index a4a65727..296e27d0 100644 --- a/workflows/trim-chipseq-se.cwl +++ b/workflows/trim-chipseq-se.cwl @@ -400,22 +400,21 @@ outputs: doc: "TrimGalore generated log" outputSource: trim_fastq/report_file - preseq_estimates: + preseq_estimates_plot_data: type: File? 
label: "Preseq estimates" format: "http://edamontology.org/format_3475" doc: "Preseq estimated results" - outputSource: preseq/estimates_file + outputSource: preseq_plot_data/estimates_file_plot_data 'sd:visualPlugins': - - scatter: + - line: tab: 'QC Plots' - Title: 'Preseq Estimates' - xAxisTitle: 'Total reads count' - yAxisTitle: 'Expected distinct reads count' - colors: ["#4b78a3"] + Title: 'Distinct Read Counts Estimates' + xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' + yAxisTitle: 'Distinct Reads Count' + colors: ["#4b78a3", "#a3514b"] height: 500 - data: [$1, $2] - comparable: "preseq" + data: [$2, $5] estimated_fragment_size: type: int @@ -547,6 +546,12 @@ steps: threads: threads out: [deduplicated_bam_bai_pair] + clean_sam_headers_for_preseq: + run: ../tools/samtools-clean-headers.cwl + in: + bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair + out: [preseq_bam] + preseq: label: "Sequencing depth estimation" doc: | @@ -554,10 +559,10 @@ steps: be expected from the additional sequencing of the same experiment. run: ../tools/preseq-lc-extrap.cwl in: - bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair + bam_file: clean_sam_headers_for_preseq/preseq_bam extrapolation: default: 1000000000 - out: [estimates_file] + out: [estimates_file, log_file_stdout, log_file_stderr] samtools_remove_duplicates: run: ../tools/samtools-markdup.cwl @@ -656,6 +661,18 @@ steps: preseq_results: preseq/estimates_file out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] + preseq_plot_data: + label: "Formats sequencing depth estimation data for plotting" + doc: | + Formats estimates file from preseq standard output for QC plotting. This adds a new + column that includes the actual read count point on the plot. 
+ run: ../tools/preseq-plot-data.cwl + in: + preseq_stderr_log_file: preseq/log_file_stderr + estimates_file: preseq/estimates_file + mapped_reads: get_stat/mapped_reads + out: [estimates_file_plot_data] + island_intersect: label: "Peak annotation" doc: | From 0339d9ec84de14f925407d0fd934b5e3dfa861c5 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 19 Sep 2023 17:30:29 -0400 Subject: [PATCH 085/162] Update old heatmap workflow to not fail if BAM files have identical names --- tools/heatmap-prepare.cwl | 7 +++++++ tools/homer-make-tag-directory.cwl | 19 +++++++++++++++---- workflows/heatmap.cwl | 26 ++++++++++++++------------ 3 files changed, 36 insertions(+), 16 deletions(-) diff --git a/tools/heatmap-prepare.cwl b/tools/heatmap-prepare.cwl index 484ea1ae..34effa44 100644 --- a/tools/heatmap-prepare.cwl +++ b/tools/heatmap-prepare.cwl @@ -15,6 +15,11 @@ inputs: label: "BAM files" doc: "Array of input BAM files" + output_folder: + type: string[] + label: "BAM file names" + doc: "Array of names for output folders" + fragment_size: type: int[] label: "Fragment sizes" @@ -41,10 +46,12 @@ steps: run: ../tools/homer-make-tag-directory.cwl in: bam_file: bam_file + output_folder: output_folder fragment_size: fragment_size total_reads: total_reads scatter: - bam_file + - output_folder - fragment_size - total_reads scatterMethod: dotproduct diff --git a/tools/homer-make-tag-directory.cwl b/tools/homer-make-tag-directory.cwl index 8f743d96..3acbb23f 100644 --- a/tools/homer-make-tag-directory.cwl +++ b/tools/homer-make-tag-directory.cwl @@ -3,7 +3,6 @@ class: CommandLineTool requirements: -- class: InlineJavascriptRequirement - class: InitialWorkDirRequirement listing: | ${ @@ -14,7 +13,15 @@ requirements: "writable": true} ] } - +- class: InlineJavascriptRequirement + expressionLib: + - var default_output_folder = function() { + if (inputs.output_folder){ + return inputs.output_folder.replace(/\t|\s|\[|\]|\>|\<|,|\./g, "_"); + } else { + return 
inputs.bam_file.basename.split('.')[0]; + } + }; hints: - class: DockerRequirement @@ -27,6 +34,10 @@ inputs: type: File doc: "Alignment file, BAM" + output_folder: + type: string? + doc: "Name of the directory to save outputs" + fragment_size: type: - "null" @@ -80,14 +91,14 @@ outputs: output_tag_folder: type: Directory outputBinding: - glob: $(inputs.bam_file.basename.split('.')[0]) + glob: $(default_output_folder()) doc: "Tag directory" baseCommand: ["makeTagDirectory"] arguments: - - valueFrom: $(inputs.bam_file.basename.split('.')[0]) + - valueFrom: $(default_output_folder()) - valueFrom: $("default/" + inputs.bam_file.basename) diff --git a/workflows/heatmap.cwl b/workflows/heatmap.cwl index b89b9ef4..2a1b11d6 100644 --- a/workflows/heatmap.cwl +++ b/workflows/heatmap.cwl @@ -14,12 +14,15 @@ requirements: 'sd:upstream': chipseq_sample: - - "chipseq-se.cwl" - - "chipseq-pe.cwl" - - "trim-chipseq-se.cwl" - - "trim-chipseq-pe.cwl" - - "trim-atacseq-se.cwl" - - "trim-atacseq-pe.cwl" + - "chipseq-se.cwl" + - "chipseq-pe.cwl" + - "trim-chipseq-se.cwl" + - "trim-chipseq-pe.cwl" + - "trim-atacseq-se.cwl" + - "trim-atacseq-pe.cwl" + filtered_experiment: + - "filter-peaks-for-heatmap.cwl" + - "filter-deseq-for-heatmap.cwl" inputs: @@ -39,9 +42,7 @@ inputs: 'sd:localLabel': true alignment_name: - type: - - "null" - - string[] + type: string[] label: "ChIP-Seq experiment(s)" doc: "Names for input alignment files. Order corresponds to the alignment_file" 'sd:upstreamSource': "chipseq_sample/alias" @@ -49,12 +50,12 @@ inputs: regions_file: type: File format: "http://edamontology.org/format_3003" - label: | - "Regions of interest. Formatted as headerless BED file with [chrom start end name score strand] for gene list and - [chrom start end name] for peak file. [name] should be unique, [score] is ignored" + label: "Filter ChIP/ATAC peaks or filter DESeq genes experiment" doc: | "Regions of interest. 
Formatted as headerless BED file with [chrom start end name score strand] for gene list and [chrom start end name] for peak file. [name] should be unique, [score] is ignored" + 'sd:upstreamSource': "filtered_experiment/filtered_file" + 'sd:localLabel': true recentering: type: @@ -153,6 +154,7 @@ steps: run: ../tools/heatmap-prepare.cwl in: bam_file: alignment_file + output_folder: alignment_name fragment_size: fragment_size total_reads: mapped_reads_number out: [tag_folder] From b21cc29d110047b2ae64028afaa433d43dccbed7 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 20 Sep 2023 11:30:08 -0400 Subject: [PATCH 086/162] Update DESeq filtering pipeline to support inputs from DESeq Multi Factor --- workflows/filter-deseq-for-heatmap.cwl | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/workflows/filter-deseq-for-heatmap.cwl b/workflows/filter-deseq-for-heatmap.cwl index fbb9d281..b8909ce1 100644 --- a/workflows/filter-deseq-for-heatmap.cwl +++ b/workflows/filter-deseq-for-heatmap.cwl @@ -37,8 +37,8 @@ inputs: doc: "Filtering parameters (WHERE parameters for SQL query)" 'sd:filtering': params: - columns: ["feature", "RefseqId", "GeneId", "Chrom", "TxStart", "TxEnd", "Strand", "RpkmCondition1", "RpkmCondition2", "baseMean", "log2FoldChange", "pvalue", "padj", "HCL", "HCL.1", "HCL.2", "HCL.3"] - types: ["string", "string", "string", "string", "number", "number", "string", "number", "number", "number", "number", "number", "number", "string", "string", "string", "string"] + columns: ["RefseqId", "GeneId", "Chrom", "TxStart", "TxEnd", "Strand", "RpkmCondition1", "RpkmCondition2", "baseMean", "log2FoldChange", "pvalue", "padj", "HCL", "[HCL.1]", "[HCL.2]", "[HCL.3]"] + types: ["string", "string", "string", "number", "number", "string", "number", "number", "number", "number", "number", "number", "string", "string", "string", "string"] header: type: boolean? 
@@ -93,10 +93,27 @@ outputs: steps: + rename_column: + run: ../tools/custom-bash.cwl + in: + input_file: feature_file + script: + default: | + cat $0 | grep -v "log2FoldChange" > wo_header.tsv + HEADER=`head -n 1 $0`; + if [[ "$HEADER" != *"GeneId"* ]]; + then + HEADER="${HEADER//feature/GeneId}" + fi + echo -e "${HEADER}" > `basename $0` + cat wo_header.tsv >> `basename $0` + rm -f wo_header.tsv + out: [output_file] + feature_select: run: ../tools/feature-select-sql.cwl in: - feature_file: feature_file + feature_file: rename_column/output_file sql_query: sql_query columns: source: columns From 456800c9820e3fe4a581e37c2b7ba99754744842 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 26 Sep 2023 14:06:31 -0400 Subject: [PATCH 087/162] Update RNA trajectory workflow to plot gene expression --- tools/sc-atac-cluster.cwl | 2 +- tools/sc-atac-coverage.cwl | 2 +- tools/sc-atac-dbinding.cwl | 2 +- tools/sc-atac-reduce.cwl | 2 +- tools/sc-ctype-assign.cwl | 2 +- tools/sc-multiome-filter.cwl | 2 +- tools/sc-rna-cluster.cwl | 2 +- tools/sc-rna-da-cells.cwl | 2 +- tools/sc-rna-de-pseudobulk.cwl | 2 +- tools/sc-rna-filter.cwl | 2 +- tools/sc-rna-reduce.cwl | 2 +- tools/sc-rna-trajectory.cwl | 103 +++++++++++++++++++++++++++++++- tools/sc-triangulate.cwl | 2 +- tools/sc-vdj-profile.cwl | 2 +- tools/sc-wnn-cluster.cwl | 2 +- workflows/sc-rna-trajectory.cwl | 97 +++++++++++++++++++++++++++++- 16 files changed, 212 insertions(+), 16 deletions(-) diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index 6b4800cc..ad5aaf92 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.29 + dockerPull: biowardrobe2/sc-tools:v0.0.30 inputs: diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index 95dc12e5..cde4a118 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - 
class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.29 + dockerPull: biowardrobe2/sc-tools:v0.0.30 inputs: diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index 77e0ac96..233ac4fd 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.29 + dockerPull: biowardrobe2/sc-tools:v0.0.30 inputs: diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index 500dc9a6..5be0292d 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.29 + dockerPull: biowardrobe2/sc-tools:v0.0.30 inputs: diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index dcfff3e7..f70ced74 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.29 + dockerPull: biowardrobe2/sc-tools:v0.0.30 inputs: diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index fa42a47f..22f8c4ae 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.29 + dockerPull: biowardrobe2/sc-tools:v0.0.30 inputs: diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index 3fbfc7a6..5c19781a 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.29 + dockerPull: biowardrobe2/sc-tools:v0.0.30 inputs: diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index 5818eb34..e2acb085 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: 
biowardrobe2/sc-tools:v0.0.29 + dockerPull: biowardrobe2/sc-tools:v0.0.30 inputs: diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index 1b1ba60f..a807040d 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.29 + dockerPull: biowardrobe2/sc-tools:v0.0.30 inputs: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index cb26b8d7..35bd9679 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.29 + dockerPull: biowardrobe2/sc-tools:v0.0.30 inputs: diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index d8430d02..9b026f90 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.29 + dockerPull: biowardrobe2/sc-tools:v0.0.30 inputs: diff --git a/tools/sc-rna-trajectory.cwl b/tools/sc-rna-trajectory.cwl index 0efcf0fc..7f088783 100644 --- a/tools/sc-rna-trajectory.cwl +++ b/tools/sc-rna-trajectory.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.29 + dockerPull: biowardrobe2/sc-tools:v0.0.30 inputs: @@ -83,6 +83,17 @@ inputs: Number of the most predictive genes to be shows on the gene expression heatmap. Default: 50 + genes_of_interest: + type: + - "null" + - string + - string[] + inputBinding: + prefix: "--genes" + doc: | + Genes of interest to build genes expression plots. + Default: None + export_pdf_plots: type: boolean? inputBinding: @@ -305,6 +316,94 @@ outputs: Gene expression heatmap. PDF format + xpr_pstm_plot_png: + type: File? + outputBinding: + glob: "*_xpr_pstm.png" + doc: | + Gene expression along pseudotime. + PNG format + + xpr_pstm_plot_pdf: + type: File? 
+ outputBinding: + glob: "*_xpr_pstm.pdf" + doc: | + Gene expression along pseudotime. + PDF format + + pstm_dnst_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_pstm_dnst_spl_idnt.png" + doc: | + Pseudotime density, split by dataset + PNG format + + pstm_dnst_spl_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_pstm_dnst_spl_idnt.pdf" + doc: | + Pseudotime density, split by dataset + PDF format + + pstm_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_pstm_dnst_spl_cnd.png" + doc: | + Pseudotime density, split by + grouping condition + PNG format + + pstm_dnst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_pstm_dnst_spl_cnd.pdf" + doc: | + Pseudotime density, split by + grouping condition + PDF format + + pstm_hist_gr_clst_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_pstm_hist_gr_clst_spl_idnt.png" + doc: | + Pseudotime histogram, + colored by cluster, + split by dataset + PNG format + + pstm_hist_gr_clst_spl_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_pstm_hist_gr_clst_spl_idnt.pdf" + doc: | + Pseudotime histogram, + colored by cluster, + split by dataset + PDF format + + pstm_hist_gr_clst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_pstm_hist_gr_clst_spl_cnd.png" + doc: | + Pseudotime histogram, colored by + cluster, split by grouping condition + PNG format + + pstm_hist_gr_clst_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_pstm_hist_gr_clst_spl_cnd.pdf" + doc: | + Pseudotime histogram, colored by + cluster, split by grouping condition + PDF format + umap_rd_rnaumap_plot_png: type: File? outputBinding: @@ -617,6 +716,8 @@ s:about: | trajectory. Default: defined automatically --ngenes NGENES Number of the most predictive genes to be shows on the gene expression heatmap. Default: 50 + --genes Genes of interest to build genes expression plots. + Default: None --pdf Export plots in PDF. Default: false --verbose Print debug information. 
Default: false --h5seurat Save Seurat data to h5seurat file. Default: false diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index 47ac9f0d..c070071a 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.29 + dockerPull: biowardrobe2/sc-tools:v0.0.30 inputs: diff --git a/tools/sc-vdj-profile.cwl b/tools/sc-vdj-profile.cwl index 6c3ce928..891ad221 100644 --- a/tools/sc-vdj-profile.cwl +++ b/tools/sc-vdj-profile.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.29 + dockerPull: biowardrobe2/sc-tools:v0.0.30 inputs: diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index 0f351246..9b1f5a09 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.29 + dockerPull: biowardrobe2/sc-tools:v0.0.30 inputs: diff --git a/workflows/sc-rna-trajectory.cwl b/workflows/sc-rna-trajectory.cwl index 1b3455e2..f451de05 100644 --- a/workflows/sc-rna-trajectory.cwl +++ b/workflows/sc-rna-trajectory.cwl @@ -7,6 +7,14 @@ requirements: - class: StepInputExpressionRequirement - class: MultipleInputFeatureRequirement - class: InlineJavascriptRequirement + expressionLib: + - var split_features = function(line) { + function get_unique(value, index, self) { + return self.indexOf(value) === index && value != ""; + } + let splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; 'sd:upstream': @@ -75,6 +83,16 @@ inputs: the clusters. This value will define the trajectory starting point. + genes_of_interest: + type: string? + default: null + label: "Genes of interest" + doc: | + Comma or space separated list of + genes of interest to visualize + expression. 
+ Default: None + barcodes_data: type: File? label: "Selected cell barcodes (optional)" @@ -228,9 +246,20 @@ outputs: Gene expression heatmap 'sd:visualPlugins': - image: - tab: 'Heatmap' + tab: 'Gene expression' Caption: 'Gene expression heatmap' + xpr_pstm_plot_png: + type: File? + outputSource: rna_trajectory/xpr_pstm_plot_png + label: "Gene expression along pseudotime" + doc: | + Gene expression along pseudotime + 'sd:visualPlugins': + - image: + tab: 'Gene expression' + Caption: 'Gene expression along pseudotime' + umap_rd_rnaumap_plot_png: type: File? outputSource: rna_trajectory/umap_rd_rnaumap_plot_png @@ -264,6 +293,30 @@ outputs: tab: 'Pseudotime' Caption: 'UMAP, colored by pseudotime, WNN' + pstm_dnst_spl_idnt_plot_png: + type: File? + outputSource: rna_trajectory/pstm_dnst_spl_idnt_plot_png + label: "Pseudotime density, split by dataset" + doc: | + Pseudotime density, split by dataset + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Pseudotime density, split by dataset' + + pstm_hist_gr_clst_spl_idnt_plot_png: + type: File? + outputSource: rna_trajectory/pstm_hist_gr_clst_spl_idnt_plot_png + label: "Pseudotime histogram, colored by cluster, split by dataset" + doc: | + Pseudotime histogram, + colored by cluster, + split by dataset + 'sd:visualPlugins': + - image: + tab: 'Per dataset' + Caption: 'Pseudotime histogram, colored by cluster, split by dataset' + umap_spl_idnt_rd_rnaumap_plot_png: type: File? outputSource: rna_trajectory/umap_spl_idnt_rd_rnaumap_plot_png @@ -300,6 +353,30 @@ outputs: tab: 'Per dataset' Caption: 'UMAP, colored by pseudotime, split by dataset, WNN' + pstm_dnst_spl_cnd_plot_png: + type: File? 
+ outputSource: rna_trajectory/pstm_dnst_spl_cnd_plot_png + label: "Pseudotime density, split by grouping condition" + doc: | + Pseudotime density, split by + grouping condition + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Pseudotime density, split by grouping condition' + + pstm_hist_gr_clst_spl_cnd_plot_png: + type: File? + outputSource: rna_trajectory/pstm_hist_gr_clst_spl_cnd_plot_png + label: "Pseudotime histogram, colored by cluster, split by grouping condition" + doc: | + Pseudotime histogram, colored by + cluster, split by grouping condition + 'sd:visualPlugins': + - image: + tab: 'Per group' + Caption: 'Pseudotime histogram, colored by cluster, split by grouping condition' + umap_spl_cnd_rd_rnaumap_plot_png: type: File? outputSource: rna_trajectory/umap_spl_cnd_rd_rnaumap_plot_png @@ -401,6 +478,9 @@ steps: trajectory_start: source: trajectory_start valueFrom: $(self==""?null:self) # safety measure + genes_of_interest: + source: genes_of_interest + valueFrom: $(split_features(self)) predictive_genes: default: 100 verbose: @@ -433,6 +513,16 @@ steps: - tplg_plot_pdf - xpr_htmp_plot_png - xpr_htmp_plot_pdf + - xpr_pstm_plot_png + - xpr_pstm_plot_pdf + - pstm_dnst_spl_idnt_plot_png + - pstm_dnst_spl_idnt_plot_pdf + - pstm_dnst_spl_cnd_plot_png + - pstm_dnst_spl_cnd_plot_pdf + - pstm_hist_gr_clst_spl_idnt_plot_png + - pstm_hist_gr_clst_spl_idnt_plot_pdf + - pstm_hist_gr_clst_spl_cnd_plot_png + - pstm_hist_gr_clst_spl_cnd_plot_pdf - umap_rd_rnaumap_plot_png - umap_rd_rnaumap_plot_pdf - umap_rd_atacumap_plot_png @@ -470,6 +560,11 @@ steps: - rna_trajectory/dndr_pstm_plot_pdf - rna_trajectory/tplg_plot_pdf - rna_trajectory/xpr_htmp_plot_pdf + - rna_trajectory/xpr_pstm_plot_pdf + - rna_trajectory/pstm_dnst_spl_idnt_plot_pdf + - rna_trajectory/pstm_dnst_spl_cnd_plot_pdf + - rna_trajectory/pstm_hist_gr_clst_spl_idnt_plot_pdf + - rna_trajectory/pstm_hist_gr_clst_spl_cnd_plot_pdf - rna_trajectory/umap_rd_rnaumap_plot_pdf - 
rna_trajectory/umap_rd_atacumap_plot_pdf - rna_trajectory/umap_rd_wnnumap_plot_pdf From 31610d62beb53140f7ea746653f78dc2b11c20ab Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 12 Oct 2023 13:49:48 -0400 Subject: [PATCH 088/162] Fix bugs in DiffBind pipelines --- tools/diffbind-multi-factor.cwl | 109 ++++++++++++++++++--------- tools/diffbind.cwl | 72 +++++++++--------- workflows/diffbind-multi-factor.cwl | 112 ++++++++++++++++++++++------ workflows/diffbind.cwl | 36 +++------ 4 files changed, 209 insertions(+), 120 deletions(-) diff --git a/tools/diffbind-multi-factor.cwl b/tools/diffbind-multi-factor.cwl index b1342aaa..e1d349dc 100644 --- a/tools/diffbind-multi-factor.cwl +++ b/tools/diffbind-multi-factor.cwl @@ -4,7 +4,7 @@ class: CommandLineTool requirements: - class: DockerRequirement - dockerPull: biowardrobe2/diffbind:v0.0.15 + dockerPull: biowardrobe2/diffbind:v0.0.16 - class: InlineJavascriptRequirement - class: InitialWorkDirRequirement listing: | @@ -66,12 +66,11 @@ inputs: TSV/CSV metadata file to describe datasets provided in --alignments and --peaks parameters. First column should have the name 'sample', all other columns names should be selected from the following list: - Tissue, Factor, Condition, Treatment, Caller, Replicate. The values - from the 'sample' column should correspond to the values provided in - --aliases parameter. For a proper --contrast intepretation, values - defined in each metadata column should not be used in any of the other - columns. All metadata columns are treated as factors (no covariates - are supported). + Tissue, Factor, Condition, Treatment, Replicate. The values from the + 'sample' column should correspond to the values provided in --aliases + parameter. For a proper --contrast intepretation, values defined in + each metadata column should not be used in any of the other columns. + All metadata columns are treated as factors (no covariates are supported). 
scoreby: type: @@ -83,8 +82,8 @@ inputs: inputBinding: prefix: "--scoreby" doc: | - Score metrics to build peak overlap correlation heatmap and exclude low - quality peaks based on the threshold provided in --score parameter. + Score metrics to exclude low quality peaks based on the + threshold provided in the --score parameter. Default: pvalue score_threshold: @@ -105,6 +104,16 @@ inputs: all datasets is bigger than or equal to the provided value. Default: 1 + rec_summits: + type: int? + inputBinding: + prefix: "--summits" + doc: | + Width in bp to extend peaks around their summits in both directions + and replace the original ones. Set it to 100 bp for ATAC-Seq and 200 + bp for ChIP-Seq datasets. To skip peaks extension and replacement, set + it to negative value. Default: 200 bp (results in 401 bp wide peaks) + overlap_threshold: type: float? inputBinding: @@ -310,20 +319,38 @@ outputs: Peakset overlap rate PDF format - pk_scr_corr_plot_png: + all_pk_scr_corr_plot_png: + type: File? + outputBinding: + glob: "*_all_pk_scr_corr.png" + doc: | + Datasets correlation (all peaks) + PNG format + + all_pk_scr_corr_plot_pdf: + type: File? + outputBinding: + glob: "*_all_pk_scr_corr.pdf" + doc: | + Datasets correlation (all peaks) + PDF format + + cns_pk_scr_corr_plot_png: type: File? outputBinding: - glob: "*_pk_scr_corr.png" + glob: "*_cns_pk_scr_corr.png" doc: | - Datasets correlation (peak score) + Datasets correlation (optionally + recentered consensus peaks) PNG format - pk_scr_corr_plot_pdf: + cns_pk_scr_corr_plot_pdf: type: File? outputBinding: - glob: "*_pk_scr_corr.pdf" + glob: "*_cns_pk_scr_corr.pdf" doc: | - Datasets correlation (peak score) + Datasets correlation (optionally + recentered consensus peaks) PDF format rw_rds_corr_plot_png: @@ -528,16 +555,21 @@ doc: | s:about: | - usage: run_diffbind_manual.R - [-h] --alignments ALIGNMENTS [ALIGNMENTS ...] --peaks PEAKS [PEAKS ...] - --aliases ALIASES [ALIASES ...] 
--metadata METADATA - [--scoreby {pvalue,qvalue}] [--score SCORE] [--minrpkm MINRPKM] - [--minoverlap MINOVERLAP] [--groupby [GROUPBY ...]] --design DESIGN - [--contrast CONTRAST] [--base [BASE ...]] [--method {edger,deseq2}] - [--norm {auto,rle,tmm,lib}] [--padj PADJ] [--cluster {row,column,both}] - [--rowdist {cosangle,abscosangle,euclid,abseuclid,cor,abscor}] - [--columndist {cosangle,abscosangle,euclid,abseuclid,cor,abscor}] - [--center] [--pdf] [--output OUTPUT] [--cpus CPUS] + usage: run_diffbind_manual.R [-h] --alignments ALIGNMENTS [ALIGNMENTS ...] + --peaks PEAKS [PEAKS ...] --aliases ALIASES + [ALIASES ...] --metadata METADATA + [--scoreby {pvalue,qvalue}] [--score SCORE] + [--minrpkm MINRPKM] [--summits SUMMITS] + [--minoverlap MINOVERLAP] + [--groupby [GROUPBY ...]] --design DESIGN + [--contrast CONTRAST] [--base [BASE ...]] + [--method {edger,deseq2}] + [--norm {auto,rle,tmm,lib}] [--padj PADJ] + [--cluster {row,column,both}] + [--rowdist {cosangle,abscosangle,euclid,abseuclid,cor,abscor}] + [--columndist {cosangle,abscosangle,euclid,abseuclid,cor,abscor}] + [--center] [--pdf] [--output OUTPUT] + [--cpus CPUS] DiffBind Multi-factor Analysis @@ -559,24 +591,29 @@ s:about: | --alignments and --peaks parameters. First column should have the name 'sample', all other columns names should be selected from the following list: Tissue, - Factor, Condition, Treatment, Caller, Replicate. The - values from the 'sample' column should correspond to - the values provided in --aliases parameter. For a - proper --contrast intepretation, values defined in - each metadata column should not be used in any of the - other columns. All metadata columns are treated as - factors (no covariates are supported). + Factor, Condition, Treatment, Replicate. The values + from the 'sample' column should correspond to the + values provided in --aliases parameter. 
For a proper + --contrast intepretation, values defined in each + metadata column should not be used in any of the other + columns. All metadata columns are treated as factors + (no covariates are supported). --scoreby {pvalue,qvalue} - Score metrics to build peak overlap correlation - heatmap and exclude low quality peaks based on the - threshold provided in --score parameter. Default: - pvalue + Score metrics to exclude low quality peaks based on + the threshold provided in the --score parameter. + Default: pvalue --score SCORE Filtering threshold to keep only those peaks where the metric selected in --scoreby parameter is less than or equal to the provided value. Default: 0.05 --minrpkm MINRPKM Filtering threshold to keep only those peaks where the max RPKM for all datasets is bigger than or equal to the provided value. Default: 1 + --summits SUMMITS Width in bp to extend peaks around their summits in + both directions and replace the original ones. Set it + to 100 bp for ATAC-Seq and 200 bp for ChIP-Seq + datasets. To skip peaks extension and replacement, set + it to negative value. Default: 200 bp (results in 401 + bp wide peaks) --minoverlap MINOVERLAP Filtering threshold to keep only those peaks that are present in at least this many datasets when generating diff --git a/tools/diffbind.cwl b/tools/diffbind.cwl index d94b305c..cf608aba 100644 --- a/tools/diffbind.cwl +++ b/tools/diffbind.cwl @@ -4,7 +4,7 @@ class: CommandLineTool requirements: - class: DockerRequirement - dockerPull: biowardrobe2/diffbind:v0.0.15 + dockerPull: biowardrobe2/diffbind:v0.0.16 inputs: @@ -91,12 +91,6 @@ inputs: prefix: "-c2" doc: "Condition 2 name, single word with letters and numbers only. Default: condition_2" - fragmentsize: - type: int? - inputBinding: - prefix: "-fs" - doc: "Extend each read from its endpoint along the appropriate strand. Default: 125bp" - cutoff_value: type: float? 
inputBinding: @@ -113,12 +107,6 @@ inputs: prefix: "-cp" doc: "Parameter to which cutoff should be applied (fdr or pvalue). Default: fdr" - remove_duplicates: - type: boolean? - inputBinding: - prefix: "-rd" - doc: "Remove reads that map to exactly the same genomic position. Default: false" - analysis_method: type: - "null" @@ -135,11 +123,15 @@ inputs: prefix: "-mo" doc: "Min peakset overlap. Only include peaks in at least this many peaksets when generating consensus peakset. Default: 2" - min_read_counts: + rec_summits: type: int? inputBinding: - prefix: "-mc" - doc: "Min read counts. Exclude all merged intervals where the MAX raw read counts among all of the samples is smaller than the specified value. Default: 0" + prefix: "--summits" + doc: | + Width in bp to extend peaks around their summits in both directions + and replace the original ones. Set it to 100 bp for ATAC-Seq and 200 + bp for ChIP-Seq datasets. To skip peaks extension and replacement, set + it to negative value. Default: 200 bp (results in 401 bp wide peaks) use_common: type: boolean? @@ -726,19 +718,21 @@ doc: | Runs R script to compute differentially bound sites from multiple ChIP-seq experiments using affinity (quantitative) and occupancy data. s:about: | - usage: /Users/kot4or/workspaces/cwl_ws/workflows/tools/dockerfiles/scripts/run_diffbind.R - [-h] -r1 READ1 [READ1 ...] -r2 READ2 [READ2 ...] -p1 PEAK1 [PEAK1 ...] - -p2 PEAK2 [PEAK2 ...] [-n1 [NAME1 [NAME1 ...]]] - [-n2 [NAME2 [NAME2 ...]]] [-bl [BLOCK [BLOCK ...]]] - [-pf {raw,bed,narrow,macs,bayes,tpic,sicer,fp4,swembl,csv,report}] - [-c1 CONDITION1] [-c2 CONDITION2] [-fs FRAGMENTSIZE] [-rd] - [-me {edger,deseq2,all}] [-mo MINOVERLAP] [-uc] [-mc MINCOUNTS] - [-cu CUTOFF] [-cp {pvalue,fdr}] [-th THREADS] [-pa PADDING] [-o OUTPUT] + usage: run_diffbind.R [-h] -r1 READ1 [READ1 ...] -r2 READ2 [READ2 ...] -p1 + PEAK1 [PEAK1 ...] -p2 PEAK2 [PEAK2 ...] 
+ [-n1 [NAME1 ...]] [-n2 [NAME2 ...]] [-bl [BLOCK ...]] + [-bf BLOCKFILE] + [-pf {raw,bed,narrow,macs,bayes,tpic,sicer,fp4,swembl,csv,report}] + [-c1 CONDITION1] [-c2 CONDITION2] + [-me {edger,deseq2,all}] [-mo MINOVERLAP] [-uc] + [--summits SUMMITS] [-cu CUTOFF] [-cp {pvalue,fdr}] + [-co {Reds,Greens,Blues,Greys,YlOrRd,Oranges}] + [-th THREADS] [-pa PADDING] [-o OUTPUT] Differential binding analysis of ChIP-Seq experiments using affinity (read count) data - optional arguments: + options: -h, --help show this help message and exit -r1 READ1 [READ1 ...], --read1 READ1 [READ1 ...] Read files for condition 1. Minimim 2 files in BAM @@ -752,16 +746,21 @@ s:about: | -p2 PEAK2 [PEAK2 ...], --peak2 PEAK2 [PEAK2 ...] Peak files for condition 2. Minimim 2 files in format set with -pf - -n1 [NAME1 [NAME1 ...]], --name1 [NAME1 [NAME1 ...]] + -n1 [NAME1 ...], --name1 [NAME1 ...] Sample names for condition 1. Default: basenames of -r1 without extensions - -n2 [NAME2 [NAME2 ...]], --name2 [NAME2 [NAME2 ...]] + -n2 [NAME2 ...], --name2 [NAME2 ...] Sample names for condition 2. Default: basenames of -r2 without extensions - -bl [BLOCK [BLOCK ...]], --block [BLOCK [BLOCK ...]] + -bl [BLOCK ...], --block [BLOCK ...] Blocking attribute for multi-factor analysis. Minimum 2. Either names from --name1 or/and --name2 or array of bool based on [read1]+[read2]. Default: not applied + -bf BLOCKFILE, --blockfile BLOCKFILE + Blocking attribute metadata file for multi-factor + analysis. Headerless TSV/CSV file. First column - + names from --name1 and --name2, second column - group + name. --block is ignored -pf {raw,bed,narrow,macs,bayes,tpic,sicer,fp4,swembl,csv,report}, --peakformat {raw,bed,narrow,macs,bayes,tpic,sicer,fp4,swembl,csv,report} Peak files format. One of [raw, bed, narrow, macs, bayes, tpic, sicer, fp4, swembl, csv, report]. @@ -772,11 +771,6 @@ s:about: | -c2 CONDITION2, --condition2 CONDITION2 Condition 2 name, single word with letters and numbers only. 
Default: condition_2 - -fs FRAGMENTSIZE, --fragmentsize FRAGMENTSIZE - Extend each read from its endpoint along the - appropriate strand. Default: 125bp - -rd, --removedup Remove reads that map to exactly the same genomic - position. Default: false -me {edger,deseq2,all}, --method {edger,deseq2,all} Method by which to analyze differential binding affinity. Default: all @@ -787,16 +781,20 @@ s:about: | -uc, --usecommon Derive consensus peaks only from the common peaks within each condition. Min peakset overlap and min read counts are ignored. Default: false - -mc MINCOUNTS, --mincounts MINCOUNTS - Min read counts. Exclude all merged intervals where - the MAX raw read counts among all of the samples is - smaller than the specified value. Default: 0 + --summits SUMMITS Width in bp to extend peaks around their summits in + both directions and replace the original ones. Set it + to 100 bp for ATAC-Seq and 200 bp for ChIP-Seq + datasets. To skip peaks extension and replacement, set + it to negative value. Default: 200 bp (results in 401 + bp wide peaks) -cu CUTOFF, --cutoff CUTOFF Cutoff for reported results. Applied to the parameter set with -cp. Default: 0.05 -cp {pvalue,fdr}, --cparam {pvalue,fdr} Parameter to which cutoff should be applied (fdr or pvalue). Default: fdr + -co {Reds,Greens,Blues,Greys,YlOrRd,Oranges}, --color {Reds,Greens,Blues,Greys,YlOrRd,Oranges} + Color scheme. Default: Greens -th THREADS, --threads THREADS Threads to use -pa PADDING, --padding PADDING diff --git a/workflows/diffbind-multi-factor.cwl b/workflows/diffbind-multi-factor.cwl index a9c32c7a..44c731c2 100644 --- a/workflows/diffbind-multi-factor.cwl +++ b/workflows/diffbind-multi-factor.cwl @@ -115,9 +115,9 @@ inputs: input samples categories. First column should have the name 'sample', all other columns names should be selected from the following list: - Tissue, Factor, Condition, Treatment, Caller, - Replicate. 
The values from the 'sample' column - should correspond to the names of the selected + Tissue, Factor, Condition, Treatment, Replicate. + The values from the 'sample' column should + correspond to the names of the selected ChIP-Seq/ATAC-Seq experiments. Values defined in each metadata column should not be used in any of the other columns. All metadata columns are treated @@ -231,6 +231,20 @@ inputs: peaks where the maximum RPKM for all samples is bigger than or equal to the provided value. + rec_summits: + type: int? + default: 200 + label: "Width in bp to extend peaks around summits" + doc: | + Width in bp to extend peaks around their summits + in both directions and replace the original ones. + Set it to 100 bp for ATAC-Seq and 200 bp for + ChIP-Seq datasets. To skip peaks extension and + replacement, set it to negative value. + Default: 200 bp (results in 401 bp wide peaks) + 'sd:layout': + advanced: true + promoter_dist: type: int? default: 1000 @@ -307,17 +321,6 @@ inputs: 'sd:layout': advanced: true - center_row: - type: boolean? - default: false - label: "Peak clustering. Apply row mean centering before clustering" - doc: | - Apply mean centering for normalized read counts - prior to running clustering by row. Ignored if - clustering method is not set to row or both. - 'sd:layout': - advanced: true - threads: type: - "null" @@ -415,17 +418,30 @@ outputs: tab: 'Exploratory plots' Caption: 'Peakset overlap rate' - pk_scr_corr_plot_png: + all_pk_scr_corr_plot_png: + type: File? + label: "Samples correlation (all peaks)" + doc: | + Samples correlation (all peaks) + PNG format + outputSource: diffbind/all_pk_scr_corr_plot_png + 'sd:visualPlugins': + - image: + tab: 'Exploratory plots' + Caption: 'Samples correlation (all peaks)' + + cns_pk_scr_corr_plot_png: type: File? - label: "Samples correlation (peak score)" + label: "Samples correlation (opt. rec. cons. 
peaks)" doc: | - Samples correlation (peak score) + Samples correlation (optionally + recentered consensus peaks) PNG format - outputSource: diffbind/pk_scr_corr_plot_png + outputSource: diffbind/cns_pk_scr_corr_plot_png 'sd:visualPlugins': - image: tab: 'Exploratory plots' - Caption: 'Samples correlation (peak score)' + Caption: 'Samples correlation (opt. rec. cons. peaks)' rw_rds_corr_plot_png: type: File? @@ -607,6 +623,14 @@ outputs: - markdownView: tab: 'Overview' + pdf_plots: + type: File + outputSource: compress_pdf_plots/compressed_folder + label: "Plots in PDF format" + doc: | + Compressed folder with plots + in PDF format + diffbind_stdout_log: type: File label: "DiffBind stdout log" @@ -714,6 +738,7 @@ steps: scoreby: scoreby score_threshold: score_threshold rpkm_threshold: rpkm_threshold + rec_summits: rec_summits overlap_threshold: overlap_threshold groupby: source: groupby @@ -735,13 +760,17 @@ steps: valueFrom: $(self=="none"?null:self) row_distance: row_distance column_distance: column_distance - center_row: center_row + center_row: + default: true + export_pdf_plots: + default: true threads: source: threads valueFrom: $(parseInt(self)) out: - pk_vrlp_s_plot_png - - pk_scr_corr_plot_png + - all_pk_scr_corr_plot_png + - cns_pk_scr_corr_plot_png - rw_rds_corr_plot_png - nr_rds_corr_plot_png - pk_prfl_plot_png @@ -749,12 +778,50 @@ steps: - diff_ma_plot_png - nr_rds_pca_1_2_plot_png - nr_rds_pca_2_3_plot_png + - pk_vrlp_s_plot_pdf + - all_pk_scr_corr_plot_pdf + - cns_pk_scr_corr_plot_pdf + - rw_rds_corr_plot_pdf + - nr_rds_corr_plot_pdf + - pk_prfl_plot_pdf + - diff_vlcn_plot_pdf + - diff_ma_plot_pdf + - nr_rds_pca_1_2_plot_pdf + - nr_rds_pca_2_3_plot_pdf - nr_rds_mds_html - diff_sts_tsv - nr_rds_gct - stdout_log - stderr_log + pdf_plots: + run: ../tools/files-to-folder.cwl + in: + input_files: + source: + - diffbind/pk_vrlp_s_plot_pdf + - diffbind/all_pk_scr_corr_plot_pdf + - diffbind/cns_pk_scr_corr_plot_pdf + - diffbind/rw_rds_corr_plot_pdf + - 
diffbind/nr_rds_corr_plot_pdf + - diffbind/pk_prfl_plot_pdf + - diffbind/diff_vlcn_plot_pdf + - diffbind/diff_ma_plot_pdf + - diffbind/nr_rds_pca_1_2_plot_pdf + - diffbind/nr_rds_pca_2_3_plot_pdf + valueFrom: $(self.flat().filter(n => n)) + folder_basename: + default: "pdf_plots" + out: + - folder + + compress_pdf_plots: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: pdf_plots/folder + out: + - compressed_folder + filter_columns: run: ../tools/custom-bash.cwl in: @@ -800,7 +867,7 @@ steps: input_file: restore_columns/output_file script: default: | - cat "$0" | awk -F "\t" 'NR==1 {for (i=1; i<=NF; i++) {ix[$i]=i} } NR>1 {color="255,0,0"; if ($ix["log2FoldChange"]<0) color="0,255,0"; print $ix["Chr"]"\t"$ix["Start"]"\t"$ix["End"]"\tpvalue="$ix["pvalue"]";padj="$ix["padj"]";log2FC="$ix["log2FoldChange"]"\t"1000"\t"$ix["Strand"]"\t"$ix["Start"]"\t"$ix["End"]"\t"color}' > `basename $0` + cat "$0" | awk -F "\t" 'NR==1 {for (i=1; i<=NF; i++) {ix[$i]=i} } NR>1 {color="255,0,0"; if ($ix["log2FoldChange"]<0) color="0,255,0"; print $ix["Chr"]"\t"$ix["Start"]"\t"$ix["End"]"\tpvalue="$ix["pvalue"]+0.0";padj="$ix["padj"]+0.0";log2FC="$ix["log2FoldChange"]"\t"1000"\t"$ix["Strand"]"\t"$ix["Start"]"\t"$ix["End"]"\t"color}' > `basename $0` out: - output_file @@ -943,7 +1010,6 @@ steps: echo "| :-- | --: |" >> experiment_info.md j=1 for i in "${@:$COUNT+1:$#}"; do - echo "Add $i as $count" echo "| $i | $j |" >> experiment_info.md (( j++ )) done; diff --git a/workflows/diffbind.cwl b/workflows/diffbind.cwl index cb946797..8a0ae308 100644 --- a/workflows/diffbind.cwl +++ b/workflows/diffbind.cwl @@ -195,11 +195,17 @@ inputs: doc: "Chromosome length file" 'sd:upstreamSource': "genome_indices/chrom_length" - fragmentsize: + rec_summits: type: int? - default: 125 - label: "Reads extension size, bp" - doc: "Extended each read from its endpoint along the appropriate strand. 
Default: 125bp" + default: 200 + label: "Width in bp to extend peaks around summits" + doc: | + Width in bp to extend peaks around their summits + in both directions and replace the original ones. + Set it to 100 bp for ATAC-Seq and 200 bp for + ChIP-Seq datasets. To skip peaks extension and + replacement, set it to negative value. + Default: 200 bp (results in 401 bp wide peaks) 'sd:layout': advanced: true @@ -227,14 +233,6 @@ inputs: 'sd:layout': advanced: true - min_read_counts: - type: int? - default: 0 - label: "Minimum read counts. Exclude intervals where MAX read counts for all samples < specified value" - doc: "Min read counts. Exclude all merged intervals where the MAX raw read counts among all of the samples is smaller than the specified value. Default: 0" - 'sd:layout': - advanced: true - use_common: type: boolean? default: false @@ -243,14 +241,6 @@ inputs: 'sd:layout': advanced: true - remove_duplicates: - type: boolean? - default: false - label: "Remove duplicated reads" - doc: "Remove reads that map to exactly the same genomic position. Default: false" - 'sd:layout': - advanced: true - cutoff_value: type: float? 
default: 0.05 @@ -796,13 +786,11 @@ steps: sample_names_cond_2: sample_names_cond_2 cutoff_value: cutoff_value cutoff_param: cutoff_param - fragmentsize: fragmentsize - remove_duplicates: remove_duplicates analysis_method: analysis_method blocked_attributes: blocked_attributes blocked_file: blocked_file min_overlap: min_overlap - min_read_counts: min_read_counts + rec_summits: rec_summits use_common: use_common threads: threads peakformat: @@ -965,7 +953,7 @@ steps: input_file: restore_columns/output_file script: default: | - cat "$0" | awk -F "\t" 'NR==1 {for (i=1; i<=NF; i++) {ix[$i]=i} } NR>1 {color="255,0,0"; if ($ix["Fold"]<0) color="0,255,0"; print $ix["Chr"]"\t"$ix["Start"]"\t"$ix["End"]"\tPv="$ix["p-value"]";FDR="$ix["FDR"]"\t"1000"\t"$ix["Strand"]"\t"$ix["Start"]"\t"$ix["End"]"\t"color}' > `basename $0` + cat "$0" | awk -F "\t" 'NR==1 {for (i=1; i<=NF; i++) {ix[$i]=i} } NR>1 {color="255,0,0"; if ($ix["Fold"]<0) color="0,255,0"; print $ix["Chr"]"\t"$ix["Start"]"\t"$ix["End"]"\tPv="$ix["p-value"]+0.0";FDR="$ix["FDR"]+0.0"\t"1000"\t"$ix["Strand"]"\t"$ix["Start"]"\t"$ix["End"]"\t"color}' > `basename $0` out: [output_file] sort_bed: From d484c077e420afe3eec214e5ae5d389a920f2e0f Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 23 Oct 2023 15:39:05 -0400 Subject: [PATCH 089/162] Update DESeq MFA to support multiple contrasts --- tools/deseq-multi-factor.cwl | 61 ++++++++++++++++++++++++-------- workflows/deseq-multi-factor.cwl | 20 +++++++++-- 2 files changed, 63 insertions(+), 18 deletions(-) diff --git a/tools/deseq-multi-factor.cwl b/tools/deseq-multi-factor.cwl index 26c39da8..2cb64bb9 100644 --- a/tools/deseq-multi-factor.cwl +++ b/tools/deseq-multi-factor.cwl @@ -8,7 +8,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/deseq:v0.0.5 + dockerPull: biowardrobe2/deseq:v0.0.6 inputs: @@ -70,8 +70,11 @@ inputs: doc: | Contrast to be be applied for the output, formatted as a mathematical formula of values from the --metadata 
table. - If not provided, the last term from the design formula will - be used. + If not provided, all possible combinations of values from + the metadata columns present in the --design but not in the + --reduced formula will be used (results will be merged giving + the priority to significantly differentially expressed genes + with higher absolute log2FoldChange values). base: type: @@ -216,6 +219,14 @@ inputs: In the exploratory visualization analysis output only features with adjusted P-value not bigger than this value. Default: 0.05 + minimum_logfc: + type: float? + inputBinding: + prefix: "--logfc" + doc: | + In the exploratory visualization analysis output only features with + absolute log2FoldChange bigger or equal to this value. Default: 0 + export_pdf_plots: type: boolean? inputBinding: @@ -365,13 +376,21 @@ doc: | s:about: | - usage: run_deseq_manual.R - [-h] --expression EXPRESSION [EXPRESSION ...] --aliases ALIASES - [ALIASES ...] --metadata METADATA --design DESIGN [--reduced REDUCED] - [--contrast CONTRAST] [--base [BASE ...]] [--type {gene,transcript}] - [--exclude [EXCLUDE ...]] [--norm {vst,rlog}] [--remove REMOVE] - [--cluster {row,column,both}] [--center] [--label [LABEL ...]] - [--padj PADJ] [--pdf] [--output OUTPUT] [--cpus CPUS] + usage: run_deseq_manual.R [-h] --expression EXPRESSION + [EXPRESSION ...] --aliases ALIASES + [ALIASES ...] --metadata METADATA + --design DESIGN [--reduced REDUCED] + [--contrast CONTRAST] + [--base [BASE ...]] + [--type {gene,transcript}] + [--exclude [EXCLUDE ...]] + [--norm {vst,rlog}] [--remove REMOVE] + [--cluster {row,column,both}] + [--rowdist {cosangle,abscosangle,euclid,abseuclid,cor,abscor}] + [--columndist {cosangle,abscosangle,euclid,abseuclid,cor,abscor}] + [--center] [--label [LABEL ...]] + [--padj PADJ] [--logfc LOGFC] [--pdf] + [--output OUTPUT] [--cpus CPUS] DESeq2 Multi-factor Analysis @@ -401,9 +420,12 @@ s:about: | Should start with ~. 
If provided, force DESeq2 to run LRT test instead of the Wald. --contrast CONTRAST Contrast to be be applied for the output, formatted as - a mathematical formula of values from the --metadata - table. If not provided, the last term from the design - formula will be used. + a mathematical formula of values from the --metadata table. + If not provided, all possible combinations of values from + the metadata columns present in the --design but not in the + --reduced formula will be used (results will be merged giving + the priority to significantly differentially expressed genes + with higher absolute log2FoldChange values). --base [BASE ...] Value(s) from each metadata file column(s) to be set as the base level(s). Number and order of provided values should correspond the order of columns in @@ -426,12 +448,18 @@ s:about: | --remove REMOVE Column from the metadata file to remove batch effect before running differential expression analysis. If present, all components that include this term will be - removed from the design and reduced formulas. - Default: do not remove batch effect + removed from the design and reduced formulas. Default: + do not remove batch effect --cluster {row,column,both} Hopach clustering method to be run on normalized read counts for the exploratory visualization analysis. Default: do not run clustering + --rowdist {cosangle,abscosangle,euclid,abseuclid,cor,abscor} + Distance metric for HOPACH row clustering. Ignored if + --cluster is not provided. Default: cosangle + --columndist {cosangle,abscosangle,euclid,abseuclid,cor,abscor} + Distance metric for HOPACH column clustering. Ignored + if --cluster is not provided. Default: euclid --center Apply mean centering for feature expression prior to running clustering by row. Ignored when --cluster is not row or both. Default: do not centered @@ -442,6 +470,9 @@ s:about: | --padj PADJ In the exploratory visualization analysis output only features with adjusted P-value not bigger than this value. 
Default: 0.05 + --logfc LOGFC In the exploratory visualization analysis output only + features with absolute log2FoldChange bigger or equal + to this value. Default: 0 --pdf Export plots in PDF. Default: false --output OUTPUT Output prefix for generated files --cpus CPUS Number of cores/cpus to use. Default: 1 \ No newline at end of file diff --git a/workflows/deseq-multi-factor.cwl b/workflows/deseq-multi-factor.cwl index 897293c0..0bfb7271 100644 --- a/workflows/deseq-multi-factor.cwl +++ b/workflows/deseq-multi-factor.cwl @@ -106,12 +106,15 @@ inputs: contrast: type: string? - label: "Contrast. If not provided, use the last term from the design formula." + label: "Contrast. If not provided, use all possible combinations" doc: | Contrast to be be applied for the output, formatted as a mathematical formula of values from the --metadata table. - If not provided, the last term from the design formula will - be used. + If not provided, all possible combinations of values from + the metadata columns present in the --design but not in the + --reduced formula will be used (results will be merged giving + the priority to significantly differentially expressed genes + with higher absolute log2FoldChange values). remove: type: string? @@ -247,6 +250,16 @@ inputs: 'sd:layout': advanced: true + minimum_logfc: + type: float? + default: 0 + label: "Minimum log2FoldChange to show features in the exploratory visualization analysis" + doc: | + In the exploratory visualization analysis output only features with + absolute log2FoldChange bigger or equal to this value. 
Default: 0 + 'sd:layout': + advanced: true + threads: type: - "null" @@ -448,6 +461,7 @@ steps: source: selected_features valueFrom: $(split_by_common_delim(self)) maximum_padj: maximum_padj + minimum_logfc: minimum_logfc threads: source: threads valueFrom: $(parseInt(self)) From 699fa23530f2b4fd5292d67fa126be53056b0ee1 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 13 Nov 2023 11:27:47 -0500 Subject: [PATCH 090/162] Update all sc tools to support replacing new.ident columns through the --barcodes parameter It allows to split datasets into a smaller pieces --- tools/sc-atac-cluster.cwl | 2 +- tools/sc-atac-coverage.cwl | 2 +- tools/sc-atac-dbinding.cwl | 2 +- tools/sc-atac-reduce.cwl | 2 +- tools/sc-ctype-assign.cwl | 2 +- tools/sc-multiome-filter.cwl | 2 +- tools/sc-rna-cluster.cwl | 2 +- tools/sc-rna-da-cells.cwl | 2 +- tools/sc-rna-de-pseudobulk.cwl | 2 +- tools/sc-rna-filter.cwl | 2 +- tools/sc-rna-reduce.cwl | 2 +- tools/sc-rna-trajectory.cwl | 2 +- tools/sc-triangulate.cwl | 2 +- tools/sc-vdj-profile.cwl | 2 +- tools/sc-wnn-cluster.cwl | 2 +- 15 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index ad5aaf92..e9e75a8f 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.30 + dockerPull: biowardrobe2/sc-tools:v0.0.31 inputs: diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index cde4a118..f6d313a0 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.30 + dockerPull: biowardrobe2/sc-tools:v0.0.31 inputs: diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index 233ac4fd..bb946fff 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: 
DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.30 + dockerPull: biowardrobe2/sc-tools:v0.0.31 inputs: diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index 5be0292d..4ad66409 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.30 + dockerPull: biowardrobe2/sc-tools:v0.0.31 inputs: diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index f70ced74..3bff1746 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.30 + dockerPull: biowardrobe2/sc-tools:v0.0.31 inputs: diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index 22f8c4ae..37c5f837 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.30 + dockerPull: biowardrobe2/sc-tools:v0.0.31 inputs: diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index 5c19781a..bcbecf55 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.30 + dockerPull: biowardrobe2/sc-tools:v0.0.31 inputs: diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index e2acb085..7814b19e 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.30 + dockerPull: biowardrobe2/sc-tools:v0.0.31 inputs: diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index a807040d..fd50cb8e 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - 
dockerPull: biowardrobe2/sc-tools:v0.0.30 + dockerPull: biowardrobe2/sc-tools:v0.0.31 inputs: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index 35bd9679..73fa7009 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.30 + dockerPull: biowardrobe2/sc-tools:v0.0.31 inputs: diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index 9b026f90..213b0a5d 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.30 + dockerPull: biowardrobe2/sc-tools:v0.0.31 inputs: diff --git a/tools/sc-rna-trajectory.cwl b/tools/sc-rna-trajectory.cwl index 7f088783..c359e3e0 100644 --- a/tools/sc-rna-trajectory.cwl +++ b/tools/sc-rna-trajectory.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.30 + dockerPull: biowardrobe2/sc-tools:v0.0.31 inputs: diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index c070071a..b9debd5c 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.30 + dockerPull: biowardrobe2/sc-tools:v0.0.31 inputs: diff --git a/tools/sc-vdj-profile.cwl b/tools/sc-vdj-profile.cwl index 891ad221..74bf9a15 100644 --- a/tools/sc-vdj-profile.cwl +++ b/tools/sc-vdj-profile.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.30 + dockerPull: biowardrobe2/sc-tools:v0.0.31 inputs: diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index 9b1f5a09..74190dac 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.30 + dockerPull: 
biowardrobe2/sc-tools:v0.0.31 inputs: From 7d9a26dd9f8a76293432fa46a8ee5340c1ffdf53 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 15 Nov 2023 13:10:53 -0500 Subject: [PATCH 091/162] When subsetting cell by values from the metadata need to split by comma If we split by both commas and spaces, we will split the columns that have spaces in the names --- workflows/sc-atac-dbinding.cwl | 9 ++++++++- workflows/sc-rna-de-pseudobulk.cwl | 13 ++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/workflows/sc-atac-dbinding.cwl b/workflows/sc-atac-dbinding.cwl index 6329d8dc..298e3e24 100644 --- a/workflows/sc-atac-dbinding.cwl +++ b/workflows/sc-atac-dbinding.cwl @@ -15,6 +15,13 @@ requirements: let splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; return (splitted_line && !!splitted_line.length)?splitted_line:null; }; + - var split_by_comma = function(line) { + function get_unique(value, index, self) { + return self.indexOf(value) === index && value != ""; + } + var splitted_line = line?line.split(/,+/).filter(get_unique):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; 'sd:upstream': @@ -656,7 +663,7 @@ steps: valueFrom: $(self==""?null:self) # safety measure subset: source: subset - valueFrom: $(split_features(self)) + valueFrom: $(split_by_comma(self)) splitby: splitby first_cond: first_cond second_cond: second_cond diff --git a/workflows/sc-rna-de-pseudobulk.cwl b/workflows/sc-rna-de-pseudobulk.cwl index 8723b34f..22f75502 100644 --- a/workflows/sc-rna-de-pseudobulk.cwl +++ b/workflows/sc-rna-de-pseudobulk.cwl @@ -15,6 +15,13 @@ requirements: var splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; return (splitted_line && !!splitted_line.length)?splitted_line:null; }; + - var split_by_comma = function(line) { + function get_unique(value, index, self) { + return self.indexOf(value) === index && value != ""; + } + var splitted_line = 
line?line.split(/,+/).filter(get_unique):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; 'sd:upstream': @@ -68,8 +75,8 @@ inputs: default: null label: "Subsetting values (optional)" doc: | - Comma or space separated list of values - from the single cell metadata column + Comma separated list of values from + the single cell metadata column selected in "Subsetting category (optional)" input. Ignored if grouping category is not provided. Default: do @@ -582,7 +589,7 @@ steps: } subset: source: subset - valueFrom: $(split_features(self)) + valueFrom: $(split_by_comma(self)) splitby: source: splitby valueFrom: | From 3325e41f7102eae2625c8bf9fecdd91955d4823e Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 11 Dec 2023 14:16:43 -0500 Subject: [PATCH 092/162] Remove deprecated pipelines --- workflows/chipseq-pe.cwl | 841 ----------- workflows/chipseq-se.cwl | 717 --------- workflows/rnaseq-pe-dutp-mitochondrial.cwl | 634 -------- workflows/rnaseq-pe-dutp.cwl | 591 -------- workflows/rnaseq-pe.cwl | 544 ------- workflows/rnaseq-se-dutp-mitochondrial.cwl | 574 ------- workflows/rnaseq-se-dutp.cwl | 527 ------- workflows/rnaseq-se.cwl | 480 ------ workflows/sc-assign-cell-types.cwl | 337 ----- workflows/sc-atac-cluster.cwl | 4 + workflows/sc-atac-coverage.cwl | 50 +- workflows/sc-atac-dbinding.cwl | 50 +- workflows/sc-atac-reduce.cwl | 2 + workflows/sc-ctype-assign.cwl | 4 + workflows/sc-multiome-filter.cwl | 2 + workflows/sc-rna-cluster.cwl | 4 + workflows/sc-rna-da-cells.cwl | 50 +- workflows/sc-rna-de-pseudobulk.cwl | 4 + workflows/sc-rna-filter.cwl | 50 +- workflows/sc-rna-reduce.cwl | 4 + workflows/sc-rna-trajectory.cwl | 4 + workflows/sc-triangulate.cwl | 50 +- workflows/sc-vdj-profile.cwl | 4 + workflows/sc-wnn-cluster.cwl | 4 + workflows/sc_diff_expr.cwl | 421 ------ workflows/seurat-cluster.cwl | 1559 -------------------- 26 files changed, 101 insertions(+), 7410 deletions(-) delete mode 100644 workflows/chipseq-pe.cwl 
delete mode 100644 workflows/chipseq-se.cwl delete mode 100644 workflows/rnaseq-pe-dutp-mitochondrial.cwl delete mode 100644 workflows/rnaseq-pe-dutp.cwl delete mode 100644 workflows/rnaseq-pe.cwl delete mode 100644 workflows/rnaseq-se-dutp-mitochondrial.cwl delete mode 100644 workflows/rnaseq-se-dutp.cwl delete mode 100644 workflows/rnaseq-se.cwl delete mode 100644 workflows/sc-assign-cell-types.cwl delete mode 100644 workflows/sc_diff_expr.cwl delete mode 100644 workflows/seurat-cluster.cwl diff --git a/workflows/chipseq-pe.cwl b/workflows/chipseq-pe.cwl deleted file mode 100644 index 2aa5568f..00000000 --- a/workflows/chipseq-pe.cwl +++ /dev/null @@ -1,841 +0,0 @@ -cwlVersion: v1.0 -class: Workflow - -requirements: -- class: SubworkflowFeatureRequirement -- class: ScatterFeatureRequirement -- class: StepInputExpressionRequirement -- class: MultipleInputFeatureRequirement -- class: InlineJavascriptRequirement - expressionLib: - - var get_root = function(basename) { - return basename.split('.').slice(0,1).join('.'); - }; - -'sd:metadata': - - "../metadata/chipseq-header.cwl" - -'sd:upstream': - genome_indices: "genome-indices.cwl" - control_file: "chipseq-pe.cwl" - - -inputs: - - indices_folder: - type: Directory - 'sd:upstreamSource': "genome_indices/bowtie_indices" - label: "Indexed genome folder (bowtie)" - doc: "Path to indexed genome folder by **bowtie**" - - annotation_file: - type: File - 'sd:upstreamSource': "genome_indices/annotation" - label: "Annotation file" - format: "http://edamontology.org/format_3475" - doc: "Tab-separated annotation file" - - genome_size: - type: string - 'sd:upstreamSource': "genome_indices/genome_size" - label: "Effective genome size" - doc: "MACS2 effective genome size: hs, mm, ce, dm or number, for example 2.7e9" - - chrom_length: - type: File - 'sd:upstreamSource': "genome_indices/chrom_length" - label: "Chromosomes length file" - format: "http://edamontology.org/format_2330" - doc: "Chromosomes length file" - - control_file: 
- type: File? - default: null - 'sd:upstreamSource': "control_file/bambai_pair" - 'sd:localLabel': true - label: "Use experiment as a control" - format: "http://edamontology.org/format_2572" - doc: "Use experiment as a control for MACS2 peak calling" - - broad_peak: - type: boolean? - default: False - label: "Callpeak broad" - doc: "Set to call broad peak for MACS2" - - fastq_file_upstream: - type: - - File - - type: array - items: File - label: "FASTQ 1 input file(s)" - format: "http://edamontology.org/format_1930" - doc: "Reads data in a FASTQ format, received after paired end sequencing" - - fastq_file_downstream: - type: - - File - - type: array - items: File - label: "FASTQ 2 input file(s)" - format: "http://edamontology.org/format_1930" - doc: "Reads data in a FASTQ format, received after paired end sequencing" - - exp_fragment_size: - type: int? - default: 150 - 'sd:layout': - advanced: true - label: "Expected fragment size" - doc: "Expected fragment size for MACS2" - - force_fragment_size: - type: boolean? - default: false - 'sd:layout': - advanced: true - label: "Force fragment size" - doc: "Force MACS2 to use exp_fragment_size" - - clip_3p_end: - type: int? - default: 0 - 'sd:layout': - advanced: true - label: "Clip from 3p end" - doc: "Number of bases to clip from the 3p end" - - clip_5p_end: - type: int? - default: 0 - 'sd:layout': - advanced: true - label: "Clip from 5p end" - doc: "Number of bases to clip from the 5p end" - - remove_duplicates: - type: boolean? - default: false - 'sd:layout': - advanced: true - label: "Remove duplicates" - doc: "Calls samtools rmdup to remove duplicates from sortesd BAM file" - - peak_calling_fdr: - type: float? - default: 0.05 - 'sd:layout': - advanced: true - label: "Minimum FDR (q-value) cutoff for peak detection" - doc: | - Minimum FDR (q-value) cutoff for peak detection. -q, and - -p are mutually exclusive. - - promoter_dist: - type: int? 
- default: 1000 - 'sd:layout': - advanced: true - label: "Max distance from gene TSS (in both direction) overlapping which the peak will be assigned to the promoter region" - doc: "Max distance from gene TSS (in both direction) overlapping which the peak will be assigned to the promoter region" - - upstream_dist: - type: int? - default: 20000 - 'sd:layout': - advanced: true - label: "Max distance from the promoter (only in upstream direction) overlapping which the peak will be assigned to the upstream region" - doc: "Max distance from the promoter (only in upstream direction) overlapping which the peak will be assigned to the upstream region" - - threads: - type: int? - default: 2 - 'sd:layout': - advanced: true - doc: "Number of threads for those steps that support multithreading" - label: "Number of threads" - -outputs: - - unaligned_fastq: - type: - - "null" - - File[] - format: "http://edamontology.org/format_1930" - label: "Unaligned FASTQ file(s)" - doc: "Unaligned FASTQ file(s)" - outputSource: bowtie_aligner/unaligned_fastq - - multimapped_fastq: - type: - - "null" - - File[] - format: "http://edamontology.org/format_1930" - label: "Multimapped FASTQ file(s)" - doc: "Multimapped FASTQ file(s)" - outputSource: bowtie_aligner/multimapped_fastq - - bigwig: - type: File - format: "http://edamontology.org/format_3006" - label: "BigWig file" - doc: "Generated BigWig file" - outputSource: bam_to_bigwig/bigwig_file - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'wig' - name: "BigWig Track" - height: 120 - - fastx_statistics_upstream: - type: File - label: "FASTQ 1 statistics" - format: "http://edamontology.org/format_2330" - doc: "fastx_quality_stats generated FASTQ 1 quality statistics file" - outputSource: fastx_quality_stats_upstream/statistics_file - 'sd:visualPlugins': - - line: - tab: 'QC Plots' - Title: 'FASTQ 1 Base frequency plot' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Frequency' - colors: 
["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$13, $14, $15, $16, $17] - - boxplot: - tab: 'QC Plots' - Title: 'FASTQ 1 Quality Control' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Quality score' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$11, $7, $8, $9, $12] - - fastx_statistics_downstream: - type: File - label: "FASTQ 2 statistics" - format: "http://edamontology.org/format_2330" - doc: "fastx_quality_stats generated FASTQ 2 quality statistics file" - outputSource: fastx_quality_stats_downstream/statistics_file - 'sd:visualPlugins': - - line: - tab: 'QC Plots' - Title: 'FASTQ 2 Base frequency plot' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Frequency' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$13, $14, $15, $16, $17] - - boxplot: - tab: 'QC Plots' - Title: 'FASTQ 2 Quality Control' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Quality score' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$11, $7, $8, $9, $12] - - bowtie_log: - type: File - label: "BOWTIE alignment log" - format: "http://edamontology.org/format_2330" - doc: "BOWTIE generated alignment log" - outputSource: bowtie_aligner/log_file - - iaintersect_log: - type: File - label: "Island intersect log" - format: "http://edamontology.org/format_3475" - doc: "Iaintersect generated log" - outputSource: island_intersect/log_file - - iaintersect_result: - type: File - label: "Island intersect results" - format: "http://edamontology.org/format_3475" - doc: "Iaintersect generated results" - outputSource: island_intersect/result_file - 'sd:visualPlugins': - - syncfusiongrid: - tab: 'Peak Calling' - Title: 'Islands list' - - atdp_log: - type: File - label: "ATDP log" - format: "http://edamontology.org/format_3475" - doc: "Average Tag Density generated log" - outputSource: average_tag_density/log_file - - atdp_result: - type: File - label: "ATDP results" - format: 
"http://edamontology.org/format_3475" - doc: "Average Tag Density generated results" - outputSource: average_tag_density/result_file - 'sd:visualPlugins': - - scatter: - tab: 'QC Plots' - Title: 'Average Tag Density' - xAxisTitle: 'Distance From TSS (bases)' - yAxisTitle: 'Average Tag Density (per bp)' - colors: ["#b3de69"] - height: 500 - data: [$1, $2] - comparable: "atdp" - - bambai_pair: - type: File - format: "http://edamontology.org/format_2572" - label: "Coordinate sorted BAM alignment file (+index BAI)" - doc: "Coordinate sorted BAM file and BAI index file" - outputSource: samtools_remove_duplicates/deduplicated_bam_bai_pair - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - optional: true - type: 'alignment' - format: 'bam' - name: "BAM Track" - displayMode: "SQUISHED" - - macs2_called_peaks: - type: File? - label: "Called peaks" - format: "http://edamontology.org/format_3468" - doc: "XLS file to include information about called peaks" - outputSource: macs2_callpeak/peak_xls_file - - macs2_narrow_peaks: - type: File? - label: "Narrow peaks" - format: "http://edamontology.org/format_3613" - doc: "Contains the peak locations together with peak summit, pvalue and qvalue" - outputSource: macs2_callpeak/narrow_peak_file - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'annotation' - name: "Narrow peaks" - displayMode: "COLLAPSE" - height: 40 - - macs2_broad_peaks: - type: File? - label: "Broad peaks" - format: "http://edamontology.org/format_3614" - doc: "Contains the peak locations together with peak summit, pvalue and qvalue" - outputSource: macs2_callpeak/broad_peak_file - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'annotation' - name: "Broad peaks" - displayMode: "COLLAPSE" - height: 40 - - macs2_peak_summits: - type: File? 
- label: "Peak summits" - format: "http://edamontology.org/format_3003" - doc: "Contains the peak summits locations for every peaks" - outputSource: macs2_callpeak/peak_summits_file - - macs2_moder_r: - type: File? - label: "MACS2 generated R script" - format: "http://edamontology.org/format_2330" - doc: "R script to produce a PDF image about the model based on your data" - outputSource: macs2_callpeak/moder_r_file - - macs2_gapped_peak: - type: File? - label: "Gapped peaks" - format: "http://edamontology.org/format_3586" - doc: "Contains both the broad region and narrow peaks" - outputSource: macs2_callpeak/gapped_peak_file - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'annotation' - name: "Gapped peaks" - displayMode: "COLLAPSE" - height: 40 - - macs2_log: - type: File? - label: "MACS2 log" - format: "http://edamontology.org/format_2330" - doc: "MACS2 output log" - outputSource: macs2_callpeak/macs_log - - get_stat_log: - type: File? - label: "YAML formatted combined log" - format: "http://edamontology.org/format_3750" - doc: "YAML formatted combined log" - outputSource: get_stat/collected_statistics_yaml - - get_stat_markdown: - type: File? - label: "Markdown formatted combined log" - format: "http://edamontology.org/format_3835" - doc: "Markdown formatted combined log" - outputSource: get_stat/collected_statistics_md - 'sd:visualPlugins': - - markdownView: - tab: 'Overview' - - get_stat_formatted_log: - type: File? 
- label: "Bowtie & Samtools Rmdup combined formatted log" - format: "http://edamontology.org/format_3475" - doc: "Processed and combined Bowtie aligner and Samtools rmdup formatted log" - outputSource: get_stat/collected_statistics_tsv - 'sd:visualPlugins': - - tableView: - vertical: true - tab: 'Overview' - 'sd:preview': - 'sd:visualPlugins': - - pie: - colors: ['#b3de69', '#99c0db', '#fb8072', '#fdc381'] - data: [$2, $3, $4, $5] - - bam_statistics_report: - type: File - label: "BAM statistics report (original)" - format: "http://edamontology.org/format_2330" - doc: "BAM statistics report (right after alignment and sorting)" - outputSource: get_bam_statistics/log_file - - bam_statistics_report_after_filtering: - type: File - label: "BAM statistics report (after filtering)" - format: "http://edamontology.org/format_2330" - doc: "BAM statistics report (after all filters applied)" - outputSource: get_bam_statistics_after_filtering/log_file - - insert_size_report_after_filtering: - type: File - label: "Insert size distribution report (after filtering)" - format: "http://edamontology.org/format_3475" - doc: "Insert size distribution report (after all filters applied)" - outputSource: get_bam_statistics_after_filtering/ext_is_section - 'sd:visualPlugins': - - scatter: - tab: 'QC Plots' - Title: 'Insert Size Distribution (after filtering)' - xAxisTitle: 'Insert size' - yAxisTitle: 'Pairs total' - colors: ["#4b78a3"] - height: 500 - data: [$1, $2] - comparable: "isdp" - - macs2_fragment_stat: - type: File? - label: "FRAGMENT, FRAGMENTE, ISLANDS" - format: "http://edamontology.org/format_2330" - doc: "fragment, calculated fragment, islands count from MACS2 results" - outputSource: macs2_callpeak/macs2_stat_file - - preseq_estimates_plot_data: - type: File? 
- label: "Preseq estimates" - format: "http://edamontology.org/format_3475" - doc: "Preseq estimated results" - outputSource: preseq_plot_data/estimates_file_plot_data - 'sd:visualPlugins': - - line: - tab: 'QC Plots' - Title: 'Distinct Read Counts Estimates' - xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' - yAxisTitle: 'Distinct Reads Count' - colors: ["#4b78a3", "#a3514b"] - height: 500 - data: [$2, $5] - - estimated_fragment_size: - type: int - label: "Estimated fragment size" - doc: "Estimated fragment size for downstream analyses" - outputSource: macs2_callpeak/macs2_fragments_calculated - - mapped_reads_number: - type: int - label: "Mapped reads number" - doc: "Mapped reads number for downstream analyses" - outputSource: get_stat/mapped_reads - - -steps: - - extract_fastq_upstream: - label: "Loading unmapped sequence data for read 1" - doc: | - Most DNA cores and commercial NGS companies return unmapped sequence data in FASTQ format. - The data can be uploaded from users computer, downloaded directly from an ftp server of - the core facility by providing a URL or from GEO by providing SRA accession number. - run: ../tools/extract-fastq.cwl - in: - compressed_file: fastq_file_upstream - output_prefix: - default: "read_1" - out: [fastq_file] - - extract_fastq_downstream: - label: "Loading unmapped sequence data for read 2" - doc: | - Most DNA cores and commercial NGS companies return unmapped sequence data in FASTQ format. - The data can be uploaded from users computer, downloaded directly from an ftp server of - the core facility by providing a URL or from GEO by providing SRA accession number. - run: ../tools/extract-fastq.cwl - in: - compressed_file: fastq_file_downstream - output_prefix: - default: "read_2" - out: [fastq_file] - - fastx_quality_stats_upstream: - label: "Quality control of unmapped sequence data for read 1" - doc: | - Evaluates the quality of your sequence data. 
Provides per base quality scores as well as - base frequencies along the reads. These metrics can be used to identify whether your data - has any problems that should be taken into account in the subsequent analysis steps. - run: ../tools/fastx-quality-stats.cwl - in: - input_file: extract_fastq_upstream/fastq_file - out: [statistics_file] - - fastx_quality_stats_downstream: - label: "Quality control of unmapped sequence data for read 2" - doc: | - Evaluates the quality of your sequence data. Provides per base quality scores as well as - base frequencies along the reads. These metrics can be used to identify whether your data - has any problems that should be taken into account in the subsequent analysis steps. - run: ../tools/fastx-quality-stats.cwl - in: - input_file: extract_fastq_downstream/fastq_file - out: [statistics_file] - - bowtie_aligner: - label: "Alignment to reference genome" - doc: | - Aligns reads to the reference genome. - Reads are assumed to be mapped if they - have less than 3 mismatches. - sam_file output includes both mapped - and unmapped reads. 
- run: ../tools/bowtie-alignreads.cwl - in: - upstream_filelist: extract_fastq_upstream/fastq_file - downstream_filelist: extract_fastq_downstream/fastq_file - indices_folder: indices_folder - clip_3p_end: clip_3p_end - clip_5p_end: clip_5p_end - v: - default: 3 - m: - default: 1 - best: - default: true - strata: - default: true - sam: - default: true - unaligned_prefix: - default: "unaligned_reads" - multimapped_prefix: - default: "multimapped_reads" - threads: threads - q: - default: true - X: - default: 500 - out: [sam_file, log_file, unaligned_fastq, multimapped_fastq] - - samtools_sort_index: - run: ../tools/samtools-sort-index.cwl - in: - sort_input: bowtie_aligner/sam_file - threads: threads - out: [bam_bai_pair] - - samtools_mark_duplicates: - run: ../tools/samtools-markdup.cwl - in: - bam_bai_pair: samtools_sort_index/bam_bai_pair - keep_duplicates: - default: true - threads: threads - out: [deduplicated_bam_bai_pair] - - clean_sam_headers_for_preseq: - run: ../tools/samtools-clean-headers.cwl - in: - bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair - out: [preseq_bam] - - preseq: - label: "Sequencing depth estimation" - doc: | - Estimates the complexity of the sequencing library, evaluates how many reads can - be expected from the additional sequencing of the same experiment. - run: ../tools/preseq-lc-extrap.cwl - in: - bam_file: clean_sam_headers_for_preseq/preseq_bam - pe_mode: - default: true - extrapolation: - default: 1000000000 - out: [estimates_file, log_file_stdout, log_file_stderr] - - samtools_remove_duplicates: - run: ../tools/samtools-markdup.cwl - in: - bam_bai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair - keep_duplicates: - source: remove_duplicates - valueFrom: $(!self) - threads: threads - out: [deduplicated_bam_bai_pair] - - macs2_callpeak: - label: "Peak detection" - doc: | - Identifies enriched with aligned reads genome areas. Those areas correspond to the - transcription factor binding sites. 
- run: ../tools/macs2-callpeak-biowardrobe-only.cwl - in: - treatment_file: samtools_remove_duplicates/deduplicated_bam_bai_pair - control_file: control_file - nolambda: - source: control_file - valueFrom: $(!self) - genome_size: genome_size - mfold: - default: "4 40" - verbose: - default: 3 - nomodel: force_fragment_size - extsize: exp_fragment_size - bw: exp_fragment_size - broad: broad_peak - call_summits: - source: broad_peak - valueFrom: $(!self) - keep_dup: - default: auto - q_value: peak_calling_fdr - format_mode: - default: BAMPE - buffer_size: - default: 10000 - out: - - peak_xls_file - - narrow_peak_file - - peak_summits_file - - broad_peak_file - - moder_r_file - - gapped_peak_file - - treat_pileup_bdg_file - - control_lambda_bdg_file - - macs_log - - macs2_stat_file - - macs2_fragments_calculated - - bam_to_bigwig: - run: ../tools/bam-bedgraph-bigwig.cwl - in: - bam_file: samtools_remove_duplicates/deduplicated_bam_bai_pair - chrom_length_file: chrom_length - mapped_reads_number: get_stat/mapped_reads - pairchip: - default: true - out: [bigwig_file] - - get_bam_statistics: - label: "Quality control of aligned sequence data" - doc: | - Calculates alignment statistics, such as reads mapped/unmapped, average - read length and quality score, etc. 
- run: ../tools/samtools-stats.cwl - in: - bambai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair - output_filename: - source: samtools_mark_duplicates/deduplicated_bam_bai_pair - valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") - out: [log_file] - - get_bam_statistics_after_filtering: - run: ../tools/samtools-stats.cwl - in: - bambai_pair: samtools_remove_duplicates/deduplicated_bam_bai_pair - output_filename: - source: samtools_remove_duplicates/deduplicated_bam_bai_pair - valueFrom: $(get_root(self.basename)+"_bam_statistics_report_after_filtering.txt") - out: [log_file, ext_is_section, reads_mapped] - - get_stat: - run: ../tools/collect-statistics-chip-seq.cwl - in: - bowtie_alignment_report: bowtie_aligner/log_file - bam_statistics_report: get_bam_statistics/log_file - bam_statistics_after_filtering_report: get_bam_statistics_after_filtering/log_file - macs2_called_peaks: macs2_callpeak/peak_xls_file - atdp_results: average_tag_density/result_file - preseq_results: preseq/estimates_file - paired_end: - default: True - out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] - - preseq_plot_data: - label: "Formats sequencing depth estimation data for plotting" - doc: | - Formats estimates file from preseq standard output for QC plotting. This adds a new - column that includes the actual read count point on the plot. - run: ../tools/preseq-plot-data.cwl - in: - preseq_stderr_log_file: preseq/log_file_stderr - estimates_file: preseq/estimates_file - mapped_reads: get_stat/mapped_reads - out: [estimates_file_plot_data] - - island_intersect: - label: "Peak annotation" - doc: | - Assigns nearest genes to peaks to explore the biological implication of the open - chromatin binding sites. 
- run: ../tools/iaintersect.cwl - in: - input_filename: macs2_callpeak/peak_xls_file - annotation_filename: annotation_file - promoter_bp: promoter_dist - upstream_bp: upstream_dist - out: [result_file, log_file] - - samtools_sort_index_for_atdp: - run: ../tools/samtools-sort-index.cwl - in: - sort_input: samtools_remove_duplicates/deduplicated_bam_bai_pair - threads: threads - out: [bam_bai_pair] - - average_tag_density: - label: "Read enrichment around genes TSS" - doc: | - Generates average tag density plot around genes TSS as a lot of cis-regulatory - elements are close to the TSS of their targets. - run: ../tools/atdp.cwl - in: - input_file: samtools_sort_index_for_atdp/bam_bai_pair - annotation_filename: annotation_file - fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated - avd_window_bp: - default: 5000 - avd_smooth_bp: - default: 50 - ignore_chr: - default: chrM - double_chr: - default: "chrX chrY" - avd_heat_window_bp: - default: 200 - mapped_reads: - source: get_bam_statistics_after_filtering/reads_mapped - valueFrom: $(parseInt(self/2)) - out: [result_file, log_file] - - -$namespaces: - s: http://schema.org/ - -$schemas: -- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - -label: "Deprecated. ChIP-Seq pipeline paired-end" -s:name: "Deprecated. 
ChIP-Seq pipeline paired-end" -s:alternateName: "ChIP-Seq basic analysis workflow for a paired-end experiment" - -s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/chipseq-pe.cwl -s:codeRepository: https://github.com/datirium/workflows -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Cincinnati Children's Hospital Medical Center" - s:location: - - class: s:PostalAddress - s:addressCountry: "USA" - s:addressLocality: "Cincinnati" - s:addressRegion: "OH" - s:postalCode: "45229" - s:streetAddress: "3333 Burnet Ave" - s:telephone: "+1(513)636-4200" - s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" - s:department: - - class: s:Organization - s:legalName: "Allergy and Immunology" - s:department: - - class: s:Organization - s:legalName: "Barski Research Lab" - s:member: - - class: s:Person - s:name: Michael Kotliar - s:email: mailto:michael.kotliar@cchmc.org - s:sameAs: - - id: http://orcid.org/0000-0002-6486-3898 - -# doc: -# $include: ../descriptions/chipseq-pe.md - - -doc: | - The original [BioWardrobe's](https://biowardrobe.com) [PubMed ID:26248465](https://www.ncbi.nlm.nih.gov/pubmed/26248465) - **ChIP-Seq** basic analysis workflow for a **paired-end** experiment. - A [FASTQ](http://maq.sourceforge.net/fastq.shtml) input file has to be provided. - - The pipeline produces a sorted BAM file alongside with index BAI file, quality - statistics of the input FASTQ file, coverage by estimated fragments as a BigWig file, peaks calling - data in a form of narrowPeak or broadPeak files, islands with the assigned nearest genes and - region type, data for average tag density plot. - - Workflow starts with step *fastx\_quality\_stats* from FASTX-Toolkit - to calculate quality statistics for input FASTQ file. 
- - At the same time `bowtie` is used to align - reads from input FASTQ file to reference genome *bowtie\_aligner*. The output of this step - is an unsorted SAM file which is being sorted and indexed by `samtools sort` and `samtools index` - *samtools\_sort\_index*. - - Depending on workflow’s input parameters indexed and sorted BAM file - can be processed by `samtools markdup` *samtools\_remove\_duplicates* to get rid of duplicated reads. - - Next `macs2 callpeak` performs peak calling *macs2\_callpeak* and the next step - reports *macs2\_island\_count* the number of islands and estimated fragment size. If the latter - is less that 80bp (hardcoded in the workflow) `macs2 callpeak` is rerun again with forced fixed - fragment size value (*macs2\_callpeak\_forced*). It is also possible to force MACS2 to use pre set fragment size in the first place. - - Next step (*macs2\_stat*) is used to define which of the islands and estimated fragment size should be used - in workflow output: either from *macs2\_island\_count* step or from *macs2\_island\_count\_forced* step. If input - trigger of this step is set to True it means that *macs2\_callpeak\_forced* step was run and it returned different - from *macs2\_callpeak* step results, so *macs2\_stat* step should return [fragments\_new, fragments\_old, islands\_new], - if trigger is False the step returns [fragments\_old, fragments\_old, islands\_old], where sufix "old" defines - results obtained from *macs2\_island\_count* step and sufix "new" - from *macs2\_island\_count\_forced* step. - - The following two steps (*bamtools\_stats* and *bam\_to\_bigwig*) are used to calculate coverage from BAM file and save it in BigWig format. For that purpose bamtools stats returns the number of - mapped reads which is then used as scaling factor by bedtools genomecov when it performs coverage - calculation and saves it as a BEDgraph file whichis then sorted and converted to BigWig format by - bedGraphToBigWig tool from UCSC utilities. 
Step *get\_stat* is used to return a text file with statistics - in a form of [TOTAL, ALIGNED, SUPRESSED, USED] reads count. - - Step *island\_intersect* assigns nearest genes and regions to the islands obtained from *macs2\_callpeak\_forced*. - Step *average\_tag\_density* is used to calculate data for average tag density plot from the BAM file. \ No newline at end of file diff --git a/workflows/chipseq-se.cwl b/workflows/chipseq-se.cwl deleted file mode 100644 index d6f83eb5..00000000 --- a/workflows/chipseq-se.cwl +++ /dev/null @@ -1,717 +0,0 @@ -cwlVersion: v1.0 -class: Workflow - -requirements: - - class: SubworkflowFeatureRequirement - - class: ScatterFeatureRequirement - - class: StepInputExpressionRequirement - - class: MultipleInputFeatureRequirement - - class: InlineJavascriptRequirement - expressionLib: - - var get_root = function(basename) { - return basename.split('.').slice(0,1).join('.'); - }; - - -'sd:metadata': - - "../metadata/chipseq-header.cwl" - -'sd:upstream': - genome_indices: "genome-indices.cwl" - control_file: "chipseq-se.cwl" - - -inputs: - - indices_folder: - type: Directory - 'sd:upstreamSource': "genome_indices/bowtie_indices" - label: "Genome indices" - doc: "Directory with the genome indices generated by Bowtie" - - annotation_file: - type: File - 'sd:upstreamSource': "genome_indices/annotation" - label: "Genome annotation file" - format: "http://edamontology.org/format_3475" - doc: "Genome annotation file in TSV format" - - genome_size: - type: string - 'sd:upstreamSource': "genome_indices/genome_size" - label: "Effective genome size" - doc: "The length of the mappable genome (hs, mm, ce, dm or number, for example 2.7e9)" - - chrom_length: - type: File - 'sd:upstreamSource': "genome_indices/chrom_length" - label: "Chromosome lengths file" - format: "http://edamontology.org/format_2330" - doc: "Chromosome lengths file in TSV format" - - control_file: - type: File? 
- default: null - 'sd:upstreamSource': "control_file/bambai_pair" - 'sd:localLabel': true - label: "Control ChIP-Seq single-read experiment" - format: "http://edamontology.org/format_2572" - doc: "Indexed BAM file from the ChIP-Seq single-read experiment to be used as a control for MACS2 peak calling" - - broad_peak: - type: boolean? - default: False - # 'sd:parent': "https://raw.githubusercontent.com/datirium/workflows/master/tags/antibody-dummy.cwl" - label: "Call broad peaks" - doc: "Make MACS2 call broad peaks by linking nearby highly enriched regions" - - fastq_file: - type: - - File - - type: array - items: File - label: "FASTQ file(s)" - format: "http://edamontology.org/format_1930" - doc: "Single-read sequencing data in FASTQ format (fastq, fq, bzip2, gzip, zip)" - - exp_fragment_size: - type: int? - default: 150 - 'sd:layout': - advanced: true - label: "Expected fragment size" - doc: "Expected fragment size for read extenstion towards 3' end if force_fragment_size was set to True or if calculated by MACS2 fragment size was less that 80 bp" - - force_fragment_size: - type: boolean? - default: false - 'sd:layout': - advanced: true - label: "Force peak calling with expected fragment size" - doc: "Make MACS2 don't build the shifting model and use expected fragment size for read extenstion towards 3' end" - - clip_3p_end: - type: int? - default: 0 - 'sd:layout': - advanced: true - label: "Clip from 3' end" - doc: "Number of base pairs to clip from 3' end" - - clip_5p_end: - type: int? - default: 0 - 'sd:layout': - advanced: true - label: "Clip from 5' end" - doc: "Number of base pairs to clip from 5' end" - - remove_duplicates: - type: boolean? - default: false - 'sd:layout': - advanced: true - label: "Remove PCR duplicates" - doc: "Remove PCR duplicates from sorted BAM file" - - peak_calling_fdr: - type: float? 
- default: 0.05 - 'sd:layout': - advanced: true - label: "Minimum FDR (q-value) cutoff for peak detection" - doc: | - Minimum FDR (q-value) cutoff for peak detection. -q, and - -p are mutually exclusive. - - promoter_dist: - type: int? - default: 1000 - 'sd:layout': - advanced: true - label: "Max distance from gene TSS (in both direction) overlapping which the peak will be assigned to the promoter region" - doc: "Max distance from gene TSS (in both direction) overlapping which the peak will be assigned to the promoter region" - - upstream_dist: - type: int? - default: 20000 - 'sd:layout': - advanced: true - label: "Max distance from the promoter (only in upstream direction) overlapping which the peak will be assigned to the upstream region" - doc: "Max distance from the promoter (only in upstream direction) overlapping which the peak will be assigned to the upstream region" - - threads: - type: int? - default: 2 - 'sd:layout': - advanced: true - label: "Number of threads" - doc: "Number of threads for those steps that support multithreading" - - -outputs: - - unaligned_fastq: - type: - - "null" - - File[] - format: "http://edamontology.org/format_1930" - label: "Unaligned FASTQ file(s)" - doc: "Unaligned FASTQ file(s)" - outputSource: bowtie_aligner/unaligned_fastq - - multimapped_fastq: - type: - - "null" - - File[] - format: "http://edamontology.org/format_1930" - label: "Multimapped FASTQ file(s)" - doc: "Multimapped FASTQ file(s)" - outputSource: bowtie_aligner/multimapped_fastq - - bigwig: - type: File - format: "http://edamontology.org/format_3006" - label: "Genome coverage" - doc: "Genome coverage in bigWig format" - outputSource: bam_to_bigwig/bigwig_file - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - optional: true - type: 'wig' - name: "Genome Coverage" - height: 120 - - fastx_statistics: - type: File - label: "FASTQ quality statistics" - format: "http://edamontology.org/format_2330" - doc: "FASTQ quality statistics 
in TSV format" - outputSource: fastx_quality_stats/statistics_file - 'sd:visualPlugins': - - line: - tab: 'QC Plots' - Title: 'Base Frequency Plot' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Frequency' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$13, $14, $15, $16, $17] - - boxplot: - tab: 'QC Plots' - Title: 'Base Quality Plot' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Quality score' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$11, $7, $8, $9, $12] - - bowtie_log: - type: File - label: "Read alignment log" - format: "http://edamontology.org/format_2330" - doc: "Read alignment log file from Bowtie" - outputSource: bowtie_aligner/log_file - - iaintersect_result: - type: File - label: "Gene annotated peaks" - format: "http://edamontology.org/format_3475" - doc: "MACS2 peak file annotated with nearby genes" - outputSource: island_intersect/result_file - 'sd:visualPlugins': - - syncfusiongrid: - tab: 'Peak Calling' - Title: 'Peak Coordinates' - - atdp_result: - type: File - label: "Average Tag Density Plot" - format: "http://edamontology.org/format_3475" - doc: "Average Tag Density Plot file in TSV format" - outputSource: average_tag_density/result_file - 'sd:visualPlugins': - - scatter: - tab: 'QC Plots' - Title: 'Average Tag Density Plot' - xAxisTitle: 'Distance From TSS (bp)' - yAxisTitle: 'Average Tag Density (per bp)' - colors: ["#b3de69"] - height: 500 - data: [$1, $2] - comparable: "atdp" - - bambai_pair: - type: File - format: "http://edamontology.org/format_2572" - label: "Aligned reads" - doc: "Coordinate sorted BAM alignment and index BAI files" - outputSource: samtools_remove_duplicates/deduplicated_bam_bai_pair - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'alignment' - format: 'bam' - name: "Nucleotide Sequence Alignments" - displayMode: "SQUISHED" - - macs2_called_peaks: - type: File - label: "Called peaks" - format: 
"http://edamontology.org/format_3468" - doc: "Called peaks file with 1-based coordinates in XLS format" - outputSource: macs2_callpeak/peak_xls_file - - macs2_narrow_peaks: - type: File? - label: "Narrow peaks" - format: "http://edamontology.org/format_3613" - doc: "Called peaks file in ENCODE narrow peak format" - outputSource: macs2_callpeak/narrow_peak_file - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'annotation' - name: "Narrow peaks" - displayMode: "COLLAPSE" - height: 40 - - macs2_broad_peaks: - type: File? - label: "Broad peaks" - format: "http://edamontology.org/format_3614" - doc: "Called peaks file in ENCODE broad peak format" - outputSource: macs2_callpeak/broad_peak_file - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'annotation' - name: "Broad peaks" - displayMode: "COLLAPSE" - height: 40 - - workflow_statistics_yaml: - type: File? - label: "YAML formatted combined log" - format: "http://edamontology.org/format_3750" - doc: "YAML formatted combined log" - outputSource: get_stat/collected_statistics_yaml - - workflow_statistics_markdown: - type: File? 
- label: "Markdown formatted combined log" - format: "http://edamontology.org/format_3835" - doc: "Markdown formatted combined log" - outputSource: get_stat/collected_statistics_md - 'sd:visualPlugins': - - markdownView: - tab: 'Overview' - - workflow_statistics_tsv: - type: File - label: "Workflow execution statistics" - format: "http://edamontology.org/format_3475" - doc: "Overall workflow execution statistics from bowtie_aligner and samtools_rmdup steps" - outputSource: get_stat/collected_statistics_tsv - 'sd:visualPlugins': - - tableView: - vertical: true - tab: 'Overview' - 'sd:preview': - 'sd:visualPlugins': - - pie: - colors: ['#b3de69', '#99c0db', '#fb8072', '#fdc381'] - data: [$2, $3, $4, $5] - - bam_statistics_report: - type: File - label: "BAM statistics report (original)" - format: "http://edamontology.org/format_2330" - doc: "BAM statistics report (right after alignment and sorting)" - outputSource: get_bam_statistics/log_file - - bam_statistics_report_after_filtering: - type: File - label: "BAM statistics report (after filtering)" - format: "http://edamontology.org/format_2330" - doc: "BAM statistics report (after all filters applied)" - outputSource: get_bam_statistics_after_filtering/log_file - - preseq_estimates_plot_data: - type: File? 
- label: "Preseq estimates" - format: "http://edamontology.org/format_3475" - doc: "Preseq estimated results" - outputSource: preseq_plot_data/estimates_file_plot_data - 'sd:visualPlugins': - - line: - tab: 'QC Plots' - Title: 'Distinct Read Counts Estimates' - xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' - yAxisTitle: 'Distinct Reads Count' - colors: ["#4b78a3", "#a3514b"] - height: 500 - data: [$2, $5] - - estimated_fragment_size: - type: int - label: "Estimated fragment size" - doc: "Estimated fragment size for downstream analyses" - outputSource: macs2_callpeak/macs2_fragments_calculated - - mapped_reads_number: - type: int - label: "Mapped reads number" - doc: "Mapped reads number for downstream analyses" - outputSource: get_stat/mapped_reads - - -steps: - - extract_fastq: - label: "Loading unmapped sequence data" - doc: | - Most DNA cores and commercial NGS companies return unmapped sequence data in FASTQ format. - The data can be uploaded from users computer, downloaded directly from an ftp server of - the core facility by providing a URL or from GEO by providing SRA accession number. - run: ../tools/extract-fastq.cwl - in: - compressed_file: fastq_file - output_prefix: - default: "read_1" - out: [fastq_file] - - fastx_quality_stats: - label: "Quality control of unmapped sequence data" - doc: | - Evaluates the quality of your sequence data. Provides per base quality scores as well as - base frequencies along the reads. These metrics can be used to identify whether your data - has any problems that should be taken into account in the subsequent analysis steps. - run: ../tools/fastx-quality-stats.cwl - in: - input_file: extract_fastq/fastq_file - out: [statistics_file] - - bowtie_aligner: - label: "Alignment to reference genome" - doc: | - Aligns reads to the reference genome. - Reads are assumed to be mapped if they - have less than 3 mismatches. - sam_file output includes both mapped - and unmapped reads. 
- run: ../tools/bowtie-alignreads.cwl - in: - upstream_filelist: extract_fastq/fastq_file - indices_folder: indices_folder - clip_3p_end: clip_3p_end - clip_5p_end: clip_5p_end - v: - default: 3 - m: - default: 1 - best: - default: true - strata: - default: true - sam: - default: true - unaligned_prefix: - default: "unaligned_reads" - multimapped_prefix: - default: "multimapped_reads" - threads: threads - q: - default: true - X: - default: 500 - out: - - sam_file - - log_file - - unaligned_fastq - - multimapped_fastq - - samtools_sort_index: - run: ../tools/samtools-sort-index.cwl - in: - sort_input: bowtie_aligner/sam_file - threads: threads - out: [bam_bai_pair] - - samtools_mark_duplicates: - run: ../tools/samtools-markdup.cwl - in: - bam_bai_pair: samtools_sort_index/bam_bai_pair - keep_duplicates: - default: true - threads: threads - out: [deduplicated_bam_bai_pair] - - clean_sam_headers_for_preseq: - run: ../tools/samtools-clean-headers.cwl - in: - bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair - out: [preseq_bam] - - preseq: - label: "Sequencing depth estimation" - doc: | - Estimates the complexity of the sequencing library, evaluates how many reads can - be expected from the additional sequencing of the same experiment. - run: ../tools/preseq-lc-extrap.cwl - in: - bam_file: clean_sam_headers_for_preseq/preseq_bam - extrapolation: - default: 1000000000 - out: [estimates_file, log_file_stdout, log_file_stderr] - - samtools_remove_duplicates: - run: ../tools/samtools-markdup.cwl - in: - bam_bai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair - keep_duplicates: - source: remove_duplicates - valueFrom: $(!self) - threads: threads - out: [deduplicated_bam_bai_pair] - - macs2_callpeak: - label: "Peak detection" - doc: | - Identifies enriched with aligned reads genome areas. Those areas correspond to the - transcription factor binding sites. 
- run: ../tools/macs2-callpeak-biowardrobe-only.cwl - in: - treatment_file: samtools_remove_duplicates/deduplicated_bam_bai_pair - control_file: control_file - nolambda: - source: control_file - valueFrom: $(!self) - genome_size: genome_size - mfold: - default: "4 40" - verbose: - default: 3 - nomodel: force_fragment_size - extsize: exp_fragment_size - bw: exp_fragment_size - broad: broad_peak - call_summits: - source: broad_peak - valueFrom: $(!self) - keep_dup: - default: auto - q_value: peak_calling_fdr - format_mode: - default: BAM - buffer_size: - default: 10000 - out: - - peak_xls_file - - narrow_peak_file - - broad_peak_file - - macs2_fragments_calculated - - bam_to_bigwig: - run: ../tools/bam-bedgraph-bigwig.cwl - in: - bam_file: samtools_remove_duplicates/deduplicated_bam_bai_pair - chrom_length_file: chrom_length - mapped_reads_number: get_stat/mapped_reads - fragment_size: macs2_callpeak/macs2_fragments_calculated - out: [bigwig_file] - - get_bam_statistics: - label: "Quality control of aligned sequence data" - doc: | - Calculates alignment statistics, such as reads mapped/unmapped, average - read length and quality score, etc. 
- run: ../tools/samtools-stats.cwl - in: - bambai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair - output_filename: - source: samtools_mark_duplicates/deduplicated_bam_bai_pair - valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") - out: [log_file] - - get_bam_statistics_after_filtering: - run: ../tools/samtools-stats.cwl - in: - bambai_pair: samtools_remove_duplicates/deduplicated_bam_bai_pair - output_filename: - source: samtools_remove_duplicates/deduplicated_bam_bai_pair - valueFrom: $(get_root(self.basename)+"_bam_statistics_report_after_filtering.txt") - out: [log_file, reads_mapped] - - get_stat: - run: ../tools/collect-statistics-chip-seq.cwl - in: - bowtie_alignment_report: bowtie_aligner/log_file - bam_statistics_report: get_bam_statistics/log_file - bam_statistics_after_filtering_report: get_bam_statistics_after_filtering/log_file - macs2_called_peaks: macs2_callpeak/peak_xls_file - atdp_results: average_tag_density/result_file - preseq_results: preseq/estimates_file - out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] - - preseq_plot_data: - label: "Formats sequencing depth estimation data for plotting" - doc: | - Formats estimates file from preseq standard output for QC plotting. This adds a new - column that includes the actual read count point on the plot. - run: ../tools/preseq-plot-data.cwl - in: - preseq_stderr_log_file: preseq/log_file_stderr - estimates_file: preseq/estimates_file - mapped_reads: get_stat/mapped_reads - out: [estimates_file_plot_data] - - island_intersect: - label: "Peak annotation" - doc: | - Assigns nearest genes to peaks to explore the biological implication of the open - chromatin binding sites. 
- run: ../tools/iaintersect.cwl - in: - input_filename: macs2_callpeak/peak_xls_file - annotation_filename: annotation_file - promoter_bp: promoter_dist - upstream_bp: upstream_dist - out: [result_file] - - samtools_sort_index_for_atdp: - run: ../tools/samtools-sort-index.cwl - in: - sort_input: samtools_remove_duplicates/deduplicated_bam_bai_pair - threads: threads - out: [bam_bai_pair] - - average_tag_density: - label: "Read enrichment around genes TSS" - doc: | - Generates average tag density plot around genes TSS as a lot of cis-regulatory - elements are close to the TSS of their targets. - run: ../tools/atdp.cwl - in: - input_file: samtools_sort_index_for_atdp/bam_bai_pair - annotation_filename: annotation_file - fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated - avd_window_bp: - default: 5000 - avd_smooth_bp: - default: 50 - ignore_chr: - default: chrM - double_chr: - default: "chrX chrY" - avd_heat_window_bp: - default: 200 - mapped_reads: - source: get_bam_statistics_after_filtering/reads_mapped - valueFrom: $(parseInt(self)) - out: [result_file] - - -$namespaces: - s: http://schema.org/ - -$schemas: -- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - -label: "Deprecated. ChIP-Seq pipeline single-read" -s:name: "Deprecated. 
ChIP-Seq pipeline single-read" -s:alternateName: "ChIP-Seq basic analysis workflow for single-read data" - -s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/chipseq-se.cwl -s:codeRepository: https://github.com/datirium/workflows -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Cincinnati Children's Hospital Medical Center" - s:location: - - class: s:PostalAddress - s:addressCountry: "USA" - s:addressLocality: "Cincinnati" - s:addressRegion: "OH" - s:postalCode: "45229" - s:streetAddress: "3333 Burnet Ave" - s:telephone: "+1(513)636-4200" - s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" - s:department: - - class: s:Organization - s:legalName: "Allergy and Immunology" - s:department: - - class: s:Organization - s:legalName: "Barski Research Lab" - s:member: - - class: s:Person - s:name: Michael Kotliar - s:email: mailto:michael.kotliar@cchmc.org - s:sameAs: - - id: http://orcid.org/0000-0002-6486-3898 - -# doc: -# $include: ../descriptions/chipseq-se.md - - -doc: | - # ChIP-Seq basic analysis workflow for single-read data - - Reads are aligned to the reference genome with [Bowtie](http://bowtie-bio.sourceforge.net/index.shtml). Results are saved as coordinate sorted [BAM](http://samtools.github.io/hts-specs/SAMv1.pdf) alignment and index BAI files. Optionally, PCR duplicates can be removed. To obtain coverage in [bigWig](https://genome.ucsc.edu/goldenpath/help/bigWig.html) format, average fragment length is calculated by [MACS2](https://github.com/taoliu/MACS), and individual reads are extended to this length in the 3’ direction. 
Areas of enrichment identified by MACS2 are saved in ENCODE [narrow peak](http://genome.ucsc.edu/FAQ/FAQformat.html#format12) or [broad peak](https://genome.ucsc.edu/FAQ/FAQformat.html#format13) formats. Called peaks together with the nearest genes are saved in TSV format. In addition to basic statistics (number of total/mapped/multi-mapped/unmapped/duplicate reads), pipeline generates several quality control measures. Base frequency plots are used to estimate adapter contamination, a frequent occurrence in low-input ChIP-Seq experiments. Expected distinct reads count from [Preseq](http://smithlabresearch.org/software/preseq/) can be used to estimate read redundancy for a given sequencing depth. Average tag density profiles can be used to estimate ChIP enrichment for promoter proximal histone modifications. Use of different parameters for different antibodies (calling broad or narrow peaks) is possible. Additionally, users can elect to use BAM file from another experiment as control for MACS2 peak calling. - - ## Cite as - - *Kartashov AV, Barski A. BioWardrobe: an integrated platform for analysis of epigenomics and transcriptomics data. Genome Biol. 2015;16(1):158. Published 2015 Aug 7. 
[doi:10.1186/s13059-015-0720-3](https://www.ncbi.nlm.nih.gov/pubmed/26248465)* - - ## Software versions - - - Bowtie 1.2.0 - - Samtools 1.4 - - Preseq 2.0 - - MACS2 2.1.1.20160309 - - Bedtools 2.26.0 - - UCSC userApps v358 - - ## Inputs - - | ID | Label | Description | Required | Default | Upstream analyses | - | ------------------------- | ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------: | ------- | ------------------------------- | - | **fastq\_file** | FASTQ file | Single-read sequencing data in FASTQ format (fastq, fq, bzip2, gzip, zip) | + | | | - | **indices\_folder** | Genome indices | Directory with the genome indices generated by Bowtie | + | | genome\_indices/bowtie\_indices | - | **annotation\_file** | Genome annotation file | Genome annotation file in TSV format | + | | genome\_indices/annotation | - | **genome\_size** | Effective genome size | The length of the mappable genome (hs, mm, ce, dm or number, for example 2.7e9) | + | | genome\_indices/genome\_size | - | **chrom\_length** | Chromosome lengths file | Chromosome lengths file in TSV format | + | | genome\_indices/chrom\_length | - | **broad\_peak** | Call broad peaks | Make MACS2 call broad peaks by linking nearby highly enriched regions | + | | | - | **control\_file** | Control ChIP-Seq single-read experiment | Indexed BAM file from the ChIP-Seq single-read experiment to be used as a control for MACS2 peak calling | | Null | control\_file/bambai\_pair | - | **exp\_fragment\_size** | Expected fragment size | Expected fragment size for read extenstion towards 3' end if *force\_fragment\_size* was set to True or if calculated by MACS2 fragment size was less that 80 bp | | 150 | | - | **force\_fragment\_size** | Force peak calling with expected fragment size | Make MACS2 don't build the shifting model and use expected fragment 
size for read extenstion towards 3' end | | False | | - | **clip\_3p\_end** | Clip from 3' end | Number of base pairs to clip from 3' end | | 0 | | - | **clip\_5p\_end** | Clip from 5' end | Number of base pairs to clip from 5' end | | 0 | | - | **remove\_duplicates** | Remove PCR duplicates | Remove PCR duplicates from sorted BAM file | | False | | - | **threads** | Number of threads | Number of threads for those steps that support multithreading | | 2 | | - - - ## Outputs - - | ID | Label | Description | Required | Visualization | - | ------------------------ | ---------------------------------- | ------------------------------------------------------------------------------------ | :------: | ------------------------------------------------------------------ | - | **fastx\_statistics** | FASTQ quality statistics | FASTQ quality statistics in TSV format | + | *Base Frequency* and *Quality Control* plots in *QC Plots* tab | - | **bambai\_pair** | Aligned reads | Coordinate sorted BAM alignment and index BAI files | + | *Nucleotide Sequence Alignments* track in *IGV Genome Browser* tab | - | **bigwig** | Genome coverage | Genome coverage in bigWig format | + | *Genome Coverage* track in *IGV Genome Browser* tab | - | **iaintersect\_result** | Gene annotated peaks | MACS2 peak file annotated with nearby genes | + | *Peak Coordinates* table in *Peak Calling* tab | - | **atdp\_result** | Average Tag Density Plot | Average Tag Density Plot file in TSV format | + | *Average Tag Density Plot* in *QC Plots* tab | - | **macs2\_called\_peaks** | Called peaks | Called peaks file with 1-based coordinates in XLS format | + | | - | **macs2\_narrow\_peaks** | Narrow peaks | Called peaks file in ENCODE narrow peak format | | *Narrow peaks* track in *IGV Genome Browser* tab | - | **macs2\_broad\_peaks** | Broad peaks | Called peaks file in ENCODE broad peak format | | *Broad peaks* track in *IGV Genome Browser* tab | - | **preseq\_estimates** | Expected Distinct Reads Count Plot | 
Expected distinct reads count file from Preseq in TSV format | | *Expected Distinct Reads Count Plot* in *QC Plots* tab | - | **workflow\_statistics** | Workflow execution statistics | Overall workflow execution statistics from bowtie\_aligner and samtools\_rmdup steps | + | *Overview* tab and experiment's preview | - | **bowtie\_log** | Read alignment log | Read alignment log file from Bowtie | + | | \ No newline at end of file diff --git a/workflows/rnaseq-pe-dutp-mitochondrial.cwl b/workflows/rnaseq-pe-dutp-mitochondrial.cwl deleted file mode 100644 index 8c5496fc..00000000 --- a/workflows/rnaseq-pe-dutp-mitochondrial.cwl +++ /dev/null @@ -1,634 +0,0 @@ -cwlVersion: v1.0 -class: Workflow - -requirements: - - class: SubworkflowFeatureRequirement - - class: StepInputExpressionRequirement - - class: MultipleInputFeatureRequirement - - class: InlineJavascriptRequirement - expressionLib: - - var get_root = function(basename) { - return basename.split('.').slice(0,1).join('.'); - }; - -'sd:metadata': -- "../metadata/rnaseq-header.cwl" - - -'sd:upstream': - genome_indices: "genome-indices.cwl" - -inputs: - -# General inputs - - star_indices_folder: - type: Directory - label: "STAR indices folder" - 'sd:upstreamSource': "genome_indices/star_indices" - doc: "Path to STAR generated indices" - - star_indices_folder_mitochondrial: - type: Directory - label: "STAR indices mitochondrial folder" - 'sd:upstreamSource': "genome_indices/mitochondrial_indices" - doc: "Path to STAR generated indices for mitochondrial dna" - - bowtie_indices_folder: - type: Directory - label: "BowTie Ribosomal Indices" - 'sd:upstreamSource': "genome_indices/ribosomal_indices" - doc: "Path to Bowtie generated indices" - - chrom_length_file: - type: File - label: "Chromosome length file" - format: "http://edamontology.org/format_2330" - 'sd:upstreamSource': "genome_indices/chrom_length" - doc: "Chromosome length file" - - annotation_file: - type: File - label: "Annotation file" - format: - - 
"http://edamontology.org/format_2306" - - "http://edamontology.org/format_3475" - 'sd:upstreamSource': "genome_indices/annotation" - doc: "GTF or TAB-separated annotation file" - - fastq_file_upstream: - type: File - label: "FASTQ 1 input file" - format: "http://edamontology.org/format_1930" - doc: "Reads data in a FASTQ format, received after paired end sequencing" - - fastq_file_downstream: - type: File - label: "FASTQ 2 input file" - format: "http://edamontology.org/format_1930" - doc: "Reads data in a FASTQ format, received after paired end sequencing" - -# Advanced inputs - - exclude_chr: - type: string? - 'sd:layout': - advanced: true - label: "Chromosome to be excluded in rpkm calculation" - doc: "Chromosome to be excluded in rpkm calculation" - - clip_3p_end: - type: int? - default: 0 - 'sd:layout': - advanced: true - label: "Clip from 3p end" - doc: "Number of bases to clip from the 3p end" - - clip_5p_end: - type: int? - default: 0 - 'sd:layout': - advanced: true - label: "Clip from 5p end" - doc: "Number of bases to clip from the 5p end" - -# System dependent - - threads: - type: int? 
- default: 2 - 'sd:layout': - advanced: true - label: "Number of threads" - doc: "Number of threads for those steps that support multithreading" - -outputs: - - bigwig_upstream: - type: File - format: "http://edamontology.org/format_3006" - label: "BigWig file" - doc: "Generated BigWig file for (+)strand reads" - outputSource: bam_to_bigwig_upstream/bigwig_file - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'wig' - name: "(+)strand BigWig" - height: 120 - - bigwig_downstream: - type: File - format: "http://edamontology.org/format_3006" - label: "BigWig file" - doc: "Generated BigWig file for (-)strand reads" - outputSource: bam_to_bigwig_downstream/bigwig_file - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'wig' - name: "(-)strand BigWig" - height: 120 - - star_final_log: - type: File - format: "http://edamontology.org/format_2330" - label: "STAR final log" - doc: "STAR Log.final.out" - outputSource: star_aligner/log_final - - star_out_log: - type: File? - format: "http://edamontology.org/format_2330" - label: "STAR log out" - doc: "STAR Log.out" - outputSource: star_aligner/log_out - - star_progress_log: - type: File? - format: "http://edamontology.org/format_2330" - label: "STAR progress log" - doc: "STAR Log.progress.out" - outputSource: star_aligner/log_progress - - star_stdout_log: - type: File? - format: "http://edamontology.org/format_2330" - label: "STAR stdout log" - doc: "STAR Log.std.out" - outputSource: star_aligner/log_std - - star_sj_log: - type: File? 
- format: "http://edamontology.org/format_2330" - label: "STAR sj log" - doc: "STAR SJ.out.tab" - outputSource: star_aligner/log_sj - - fastx_statistics_upstream: - type: File - format: "http://edamontology.org/format_2330" - label: "FASTQ 1 statistics" - doc: "fastx_quality_stats generated FASTQ 1 quality statistics file" - outputSource: fastx_quality_stats_upstream/statistics_file - 'sd:visualPlugins': - - line: - tab: 'QC Plots' - Title: 'FASTQ 1 Base frequency plot' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Frequency' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$13, $14, $15, $16, $17] - - boxplot: - tab: 'QC Plots' - Title: 'FASTQ 1 Quality Control' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Quality score' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$11, $7, $8, $9, $12] - - fastx_statistics_downstream: - type: File - format: "http://edamontology.org/format_2330" - label: "FASTQ 2 statistics" - doc: "fastx_quality_stats generated FASTQ 2 quality statistics file" - outputSource: fastx_quality_stats_downstream/statistics_file - 'sd:visualPlugins': - - line: - tab: 'QC Plots' - Title: 'FASTQ 2 Base frequency plot' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Frequency' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$13, $14, $15, $16, $17] - - boxplot: - tab: 'QC Plots' - Title: 'FASTQ 2 Quality Control' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Quality score' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$11, $7, $8, $9, $12] - - bam_merged_index: - type: File - format: "http://edamontology.org/format_2572" - label: "Coordinate sorted BAM alignment file (+index BAI)" - doc: "Coordinate sorted BAM file and BAI index file" - outputSource: merge_original_and_mitochondrial_index/bam_bai_pair - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - optional: true - type: 'alignment' - 
format: 'bam' - name: "BAM Track" - displayMode: "SQUISHED" - - bowtie_log: - type: File - format: "http://edamontology.org/format_2330" - label: "Bowtie alignment log" - doc: "Bowtie alignment log file" - outputSource: bowtie_aligner/log_file - - rpkm_isoforms: - type: File - format: "http://edamontology.org/format_3752" - label: "RPKM, grouped by isoforms" - doc: "Calculated rpkm values, grouped by isoforms" - outputSource: rpkm_calculation/isoforms_file - - rpkm_genes: - type: File - format: "http://edamontology.org/format_3475" - label: "RPKM, grouped by gene name" - doc: "Calculated rpkm values, grouped by gene name" - outputSource: group_isoforms/genes_file - 'sd:visualPlugins': - - syncfusiongrid: - tab: 'Gene Expression' - Title: 'RPKM, grouped by gene name' - - rpkm_common_tss: - type: File - format: "http://edamontology.org/format_3475" - label: "RPKM, grouped by common TSS" - doc: "Calculated rpkm values, grouped by common TSS" - outputSource: group_isoforms/common_tss_file - - htseq_count_gene_expression_file: - type: File - format: "http://edamontology.org/format_3475" - label: "HTSeq: read counts grouped by gene_id" - doc: "HTSeq: read counts grouped by gene_id" - outputSource: htseq_count_gene_expression/feature_counts_report_file - - htseq_count_stdout_log: - type: File - format: "http://edamontology.org/format_2330" - label: "HTSeq: stdout log" - doc: "HTSeq: stdout log" - outputSource: htseq_count_gene_expression/stdout_log - - htseq_count_stderr_log: - type: File - format: "http://edamontology.org/format_2330" - label: "HTSeq: stderr log" - doc: "HTSeq: stderr log" - outputSource: htseq_count_gene_expression/stderr_log - - get_stat_log: - type: File? - label: "YAML formatted combined log" - format: "http://edamontology.org/format_3750" - doc: "YAML formatted combined log" - outputSource: get_stat/collected_statistics_yaml - - get_stat_markdown: - type: File? 
- label: "Markdown formatted combined log" - format: "http://edamontology.org/format_3835" - doc: "Markdown formatted combined log" - outputSource: get_stat/collected_statistics_md - 'sd:visualPlugins': - - markdownView: - tab: 'Overview' - - get_formatted_stats: - type: File? - label: "Bowtie, STAR and GEEP mapping stats" - format: "http://edamontology.org/format_2330" - doc: "Processed and combined Bowtie & STAR aligner and GEEP logs" - outputSource: get_stat/collected_statistics_tsv - 'sd:visualPlugins': - - tableView: - vertical: true - tab: 'Overview' - 'sd:preview': - 'sd:visualPlugins': - - pie: - colors: ['#b3de69', '#99c0db', '#fdc381', '#fb8072'] - data: [$2, $3, $4, $5] - - bam_statistics_report: - type: File - label: "BAM statistics report" - format: "http://edamontology.org/format_2330" - doc: "BAM statistics report (right after alignment and sorting)" - outputSource: get_bam_statistics/log_file - - insert_size_report: - type: File - label: "Insert size distribution report" - format: "http://edamontology.org/format_3475" - doc: "Insert size distribution report (right after alignment and sorting)" - outputSource: get_bam_statistics/ext_is_section - 'sd:visualPlugins': - - scatter: - tab: 'QC Plots' - Title: 'Insert Size Distribution' - xAxisTitle: 'Insert size' - yAxisTitle: 'Pairs total' - colors: ["#4b78a3"] - height: 500 - data: [$1, $2] - comparable: "isdp" - - -steps: - - extract_fastq_upstream: - run: ../tools/extract-fastq.cwl - in: - output_prefix: - default: "read_1" - compressed_file: fastq_file_upstream - out: [fastq_file] - - extract_fastq_downstream: - run: ../tools/extract-fastq.cwl - in: - output_prefix: - default: "read_2" - compressed_file: fastq_file_downstream - out: [fastq_file] - - fastx_quality_stats_upstream: - run: ../tools/fastx-quality-stats.cwl - in: - input_file: extract_fastq_upstream/fastq_file - out: [statistics_file] - - fastx_quality_stats_downstream: - run: ../tools/fastx-quality-stats.cwl - in: - input_file: 
extract_fastq_downstream/fastq_file - out: [statistics_file] - - star_aligner: - run: ../tools/star-alignreads.cwl - in: - readFilesIn: [extract_fastq_upstream/fastq_file, extract_fastq_downstream/fastq_file] - genomeDir: star_indices_folder - outFilterMultimapNmax: - default: 1 - outFilterMismatchNmax: - default: 5 - alignSJDBoverhangMin: - default: 1 - seedSearchStartLmax: - default: 15 - outReadsUnmapped: - default: "Fastx" - clip3pNbases: clip_3p_end - clip5pNbases: clip_5p_end - threads: threads - out: - - aligned_file - - log_final - - unmapped_mate_1_file - - uniquely_mapped_reads_number - - log_out - - log_progress - - log_std - - log_sj - - star_aligner_mitochondrial: - run: ../tools/star-alignreads.cwl - in: - readFilesIn: star_aligner/unmapped_mate_1_file - genomeDir: star_indices_folder_mitochondrial - outFilterMultimapNmax: - default: 1 - outFilterMismatchNmax: - default: 5 - alignSJDBoverhangMin: - default: 1 - seedSearchStartLmax: - default: 15 - clip3pNbases: clip_3p_end - clip5pNbases: clip_5p_end - threads: threads - out: - - aligned_file - - log_final - - uniquely_mapped_reads_number - - log_out - - log_progress - - log_std - - log_sj - - samtools_sort_index_mitochondrial: - run: ../tools/samtools-sort-index.cwl - in: - sort_input: star_aligner_mitochondrial/aligned_file - sort_output_filename: - source: extract_fastq_upstream/fastq_file - valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'_mitochondrial.bam') - threads: threads - out: [bam_bai_pair] - - samtools_sort_index: - run: ../tools/samtools-sort-index.cwl - in: - sort_input: star_aligner/aligned_file - sort_output_filename: - source: extract_fastq_upstream/fastq_file - valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'_sorted.bam') - threads: threads - out: [bam_bai_pair] - - merge_original_and_mitochondrial: - run: ../tools/samtools-merge.cwl - in: - output_filename: - source: extract_fastq_upstream/fastq_file - 
valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'_merged.bam') - alignment_files: [ samtools_sort_index/bam_bai_pair, samtools_sort_index_mitochondrial/bam_bai_pair ] - out: [merged_alignment_file] - - merge_original_and_mitochondrial_index: - run: ../tools/samtools-sort-index.cwl - in: - sort_input: merge_original_and_mitochondrial/merged_alignment_file - sort_output_filename: - source: extract_fastq_upstream/fastq_file - valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'.bam') - threads: threads - out: [bam_bai_pair] - - bam_to_bigwig_upstream: - run: ../tools/bam-bedgraph-bigwig.cwl - in: - bam_file: merge_original_and_mitochondrial_index/bam_bai_pair - chrom_length_file: chrom_length_file - mapped_reads_number: star_aligner/uniquely_mapped_reads_number - bigwig_filename: - source: extract_fastq_upstream/fastq_file - valueFrom: | - ${ - var root = self.basename.split('.').slice(0,-1).join('.'); - var ext = "_upstream.bigWig"; - return (root == "")?self.basename+ext:root+ext; - } - strand: - default: '+' - out: [bigwig_file] - - bam_to_bigwig_downstream: - run: ../tools/bam-bedgraph-bigwig.cwl - in: - bam_file: merge_original_and_mitochondrial_index/bam_bai_pair - chrom_length_file: chrom_length_file - mapped_reads_number: - source: star_aligner/uniquely_mapped_reads_number - valueFrom: $(-self) - bigwig_filename: - source: extract_fastq_upstream/fastq_file - valueFrom: | - ${ - var root = self.basename.split('.').slice(0,-1).join('.'); - var ext = "_downstream.bigWig"; - return (root == "")?self.basename+ext:root+ext; - } - strand: - default: '-' - out: [bigwig_file] - - bowtie_aligner: - run: ../tools/bowtie-alignreads.cwl - in: - upstream_filelist: extract_fastq_upstream/fastq_file - downstream_filelist: extract_fastq_downstream/fastq_file - indices_folder: bowtie_indices_folder - clip_3p_end: clip_3p_end - clip_5p_end: clip_5p_end - v: - default: 3 - m: - default: 1 - sam: - default: true 
- threads: threads - out: [log_file] - - rpkm_calculation: - run: ../tools/geep.cwl - in: - bam_file: merge_original_and_mitochondrial_index/bam_bai_pair - annotation_file: annotation_file - dutp: - default: true - rpkm_threshold: - default: 0.001 - exclude_chr: exclude_chr - threads: threads - out: [isoforms_file] - - group_isoforms: - run: ../tools/group-isoforms.cwl - in: - isoforms_file: rpkm_calculation/isoforms_file - out: - - genes_file - - common_tss_file - - get_annotation_gtf: - run: ../tools/ucsc-genepredtogtf.cwl - in: - annotation_tsv_file: annotation_file - out: - - annotation_gtf_file - - htseq_count_gene_expression: - run: ../tools/htseq-count.cwl - in: - alignment_bam_file: merge_original_and_mitochondrial_index/bam_bai_pair - annotation_gtf_file: get_annotation_gtf/annotation_gtf_file - strand_specific: - default: "reverse" - feature_type: - default: "exon" - feature_id: - default: "gene_id" - out: - - feature_counts_report_file - - stdout_log - - stderr_log - - get_bam_statistics: - run: ../tools/samtools-stats.cwl - in: - bambai_pair: samtools_sort_index/bam_bai_pair - output_filename: - source: samtools_sort_index/bam_bai_pair - valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") - out: [log_file, ext_is_section] - - get_stat: - run: ../tools/collect-statistics-rna-seq.cwl - in: - star_alignment_report: star_aligner/log_final - bowtie_alignment_report: bowtie_aligner/log_file - bam_statistics_report: get_bam_statistics/log_file - isoforms_file: rpkm_calculation/isoforms_file - paired_end: - default: true - out: [collected_statistics_yaml, collected_statistics_tsv, collected_statistics_md] - - -$namespaces: - s: http://schema.org/ - -$schemas: -- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - -s:name: "RNA-Seq pipeline paired-end stranded mitochondrial" -label: "RNA-Seq pipeline paired-end stranded mitochondrial" -s:alternateName: "RNA-Seq strand specific mitochondrial workflow for 
pair-end experiment based on BioWardrobe's basic analysis" - -s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/rnaseq-pe-dutp-mitochondrial.cwl -s:codeRepository: https://github.com/datirium/workflows -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Datirium, LLC" - s:member: - - class: s:Person - s:name: Artem BArski - s:email: mailto:Artem.Barski@datirum.com - - class: s:Person - s:name: Andrey Kartashov - s:email: mailto:Andrey.Kartashov@datirium.com - s:sameAs: - - id: http://orcid.org/0000-0001-9102-5681 - - -# doc: -# $include: ../descriptions/rnaseq-pe-dutp-mitochondrial.md - - -doc: | - Slightly changed original [BioWardrobe's](https://biowardrobe.com) [PubMed ID:26248465](https://www.ncbi.nlm.nih.gov/pubmed/26248465) - **RNA-Seq** basic analysis for **strand specific pair-end** experiment. - An additional steps were added to map data to mitochondrial chromosome only and then merge the output. - - Experiment files in [FASTQ](http://maq.sourceforge.net/fastq.shtml) format either compressed or not can be used. - - Current workflow should be used only with the pair-end strand specific RNA-Seq data. It performs the following steps: - 1. `STAR` to align reads from input FASTQ file according to the predefined reference indices; generate unsorted BAM file and alignment statistics file - 2. `fastx_quality_stats` to analyze input FASTQ file and generate quality statistics file - 3. `samtools sort` to generate coordinate sorted BAM(+BAI) file pair from the unsorted BAM file obtained on the step 1 (after running STAR) - 5. Generate BigWig file on the base of sorted BAM file - 6. Map input FASTQ file to predefined rRNA reference indices using Bowtie to define the level of rRNA contamination; export resulted statistics to file - 7. 
Calculate isoform expression level for the sorted BAM file and GTF/TAB annotation file using `GEEP` reads-counting utility; export results to file \ No newline at end of file diff --git a/workflows/rnaseq-pe-dutp.cwl b/workflows/rnaseq-pe-dutp.cwl deleted file mode 100644 index ed05ada3..00000000 --- a/workflows/rnaseq-pe-dutp.cwl +++ /dev/null @@ -1,591 +0,0 @@ -cwlVersion: v1.0 -class: Workflow - -requirements: - - class: SubworkflowFeatureRequirement - - class: StepInputExpressionRequirement - - class: MultipleInputFeatureRequirement - - class: InlineJavascriptRequirement - expressionLib: - - var get_root = function(basename) { - return basename.split('.').slice(0,1).join('.'); - }; - -'sd:metadata': - - "../metadata/rnaseq-header.cwl" - -'sd:upstream': - genome_indices: "genome-indices.cwl" - -inputs: - -# General inputs - - star_indices_folder: - type: Directory - label: "STAR indices folder" - 'sd:upstreamSource': "genome_indices/star_indices" - doc: "Path to STAR generated indices" - - bowtie_indices_folder: - type: Directory - label: "BowTie Ribosomal Indices" - 'sd:upstreamSource': "genome_indices/ribosomal_indices" - doc: "Path to Bowtie generated indices" - - chrom_length_file: - type: File - label: "Chromosome length file" - format: "http://edamontology.org/format_2330" - 'sd:upstreamSource': "genome_indices/chrom_length" - doc: "Chromosome length file" - - annotation_file: - type: File - label: "Annotation file" - format: - - "http://edamontology.org/format_2306" - - "http://edamontology.org/format_3475" - 'sd:upstreamSource': "genome_indices/annotation" - doc: "GTF or TAB-separated annotation file" - - fastq_file_upstream: - type: File - label: "FASTQ 1 input file" - format: "http://edamontology.org/format_1930" - doc: "Reads data in a FASTQ format, received after paired end sequencing" - - fastq_file_downstream: - type: File - label: "FASTQ 2 input file" - format: "http://edamontology.org/format_1930" - doc: "Reads data in a FASTQ format, received 
after paired end sequencing" - -# Advanced inputs - - exclude_chr: - type: string? - 'sd:layout': - advanced: true - label: "Chromosome to be excluded in rpkm calculation" - doc: "Chromosome to be excluded in rpkm calculation" - - clip_3p_end: - type: int? - default: 0 - 'sd:layout': - advanced: true - label: "Clip from 3p end" - doc: "Number of bases to clip from the 3p end" - - clip_5p_end: - type: int? - default: 0 - 'sd:layout': - advanced: true - label: "Clip from 5p end" - doc: "Number of bases to clip from the 5p end" - -# System dependent - - threads: - type: int? - default: 2 - 'sd:layout': - advanced: true - label: "Number of threads" - doc: "Number of threads for those steps that support multithreading" - -outputs: - - bigwig_upstream: - type: File - format: "http://edamontology.org/format_3006" - label: "BigWig file" - doc: "Generated BigWig file for (+)strand reads" - outputSource: bam_to_bigwig_upstream/bigwig_file - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'wig' - name: "(+)strand BigWig" - height: 120 - - bigwig_downstream: - type: File - format: "http://edamontology.org/format_3006" - label: "BigWig file" - doc: "Generated BigWig file for (-)strand reads" - outputSource: bam_to_bigwig_downstream/bigwig_file - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'wig' - name: "(-)strand BigWig" - height: 120 - - star_final_log: - type: File - format: "http://edamontology.org/format_2330" - label: "STAR final log" - doc: "STAR Log.final.out" - outputSource: star_aligner/log_final - - star_out_log: - type: File? - format: "http://edamontology.org/format_2330" - label: "STAR log out" - doc: "STAR Log.out" - outputSource: star_aligner/log_out - - star_progress_log: - type: File? - format: "http://edamontology.org/format_2330" - label: "STAR progress log" - doc: "STAR Log.progress.out" - outputSource: star_aligner/log_progress - - star_stdout_log: - type: File? 
- format: "http://edamontology.org/format_2330" - label: "STAR stdout log" - doc: "STAR Log.std.out" - outputSource: star_aligner/log_std - - star_sj_log: - type: File? - format: "http://edamontology.org/format_2330" - label: "STAR sj log" - doc: "STAR SJ.out.tab" - outputSource: star_aligner/log_sj - - fastx_statistics_upstream: - type: File - format: "http://edamontology.org/format_2330" - label: "FASTQ 1 statistics" - doc: "fastx_quality_stats generated FASTQ 1 quality statistics file" - outputSource: fastx_quality_stats_upstream/statistics_file - 'sd:visualPlugins': - - line: - tab: 'QC Plots' - Title: 'FASTQ 1 Base frequency plot' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Frequency' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$13, $14, $15, $16, $17] - - boxplot: - tab: 'QC Plots' - Title: 'FASTQ 1 Quality Control' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Quality score' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$11, $7, $8, $9, $12] - - fastx_statistics_downstream: - type: File - format: "http://edamontology.org/format_2330" - label: "FASTQ 2 statistics" - doc: "fastx_quality_stats generated FASTQ 2 quality statistics file" - outputSource: fastx_quality_stats_downstream/statistics_file - 'sd:visualPlugins': - - line: - tab: 'QC Plots' - Title: 'FASTQ 2 Base frequency plot' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Frequency' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$13, $14, $15, $16, $17] - - boxplot: - tab: 'QC Plots' - Title: 'FASTQ 2 Quality Control' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Quality score' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$11, $7, $8, $9, $12] - - bambai_pair: - type: File - format: "http://edamontology.org/format_2572" - label: "Coordinate sorted BAM alignment file (+index BAI)" - doc: "Coordinate sorted BAM file and BAI index file" - outputSource: 
samtools_sort_index/bam_bai_pair - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - optional: true - type: 'alignment' - format: 'bam' - name: "BAM Track" - displayMode: "SQUISHED" - - bowtie_log: - type: File - format: "http://edamontology.org/format_2330" - label: "Bowtie alignment log" - doc: "Bowtie alignment log file" - outputSource: bowtie_aligner/log_file - - rpkm_isoforms: - type: File - format: "http://edamontology.org/format_3752" - label: "RPKM, grouped by isoforms" - doc: "Calculated rpkm values, grouped by isoforms" - outputSource: rpkm_calculation/isoforms_file - - rpkm_genes: - type: File - format: "http://edamontology.org/format_3475" - label: "RPKM, grouped by gene name" - doc: "Calculated rpkm values, grouped by gene name" - outputSource: group_isoforms/genes_file - 'sd:visualPlugins': - - syncfusiongrid: - tab: 'Gene Expression' - Title: 'RPKM, grouped by gene name' - - rpkm_common_tss: - type: File - format: "http://edamontology.org/format_3475" - label: "RPKM, grouped by common TSS" - doc: "Calculated rpkm values, grouped by common TSS" - outputSource: group_isoforms/common_tss_file - - htseq_count_gene_expression_file: - type: File - format: "http://edamontology.org/format_3475" - label: "HTSeq: read counts grouped by gene_id" - doc: "HTSeq: read counts grouped by gene_id" - outputSource: htseq_count_gene_expression/feature_counts_report_file - - htseq_count_stdout_log: - type: File - format: "http://edamontology.org/format_2330" - label: "HTSeq: stdout log" - doc: "HTSeq: stdout log" - outputSource: htseq_count_gene_expression/stdout_log - - htseq_count_stderr_log: - type: File - format: "http://edamontology.org/format_2330" - label: "HTSeq: stderr log" - doc: "HTSeq: stderr log" - outputSource: htseq_count_gene_expression/stderr_log - - get_stat_log: - type: File? 
- label: "YAML formatted combined log" - format: "http://edamontology.org/format_3750" - doc: "YAML formatted combined log" - outputSource: get_stat/collected_statistics_yaml - - get_stat_markdown: - type: File? - label: "Markdown formatted combined log" - format: "http://edamontology.org/format_3835" - doc: "Markdown formatted combined log" - outputSource: get_stat/collected_statistics_md - 'sd:visualPlugins': - - markdownView: - tab: 'Overview' - - get_formatted_stats: - type: File? - label: "Bowtie, STAR and GEEP mapping stats" - format: "http://edamontology.org/format_2330" - doc: "Processed and combined Bowtie & STAR aligner and GEEP logs" - outputSource: get_stat/collected_statistics_tsv - 'sd:visualPlugins': - - tableView: - vertical: true - tab: 'Overview' - 'sd:preview': - 'sd:visualPlugins': - - pie: - colors: ['#b3de69', '#99c0db', '#fdc381', '#fb8072'] - data: [$2, $3, $4, $5] - - bam_statistics_report: - type: File - label: "BAM statistics report" - format: "http://edamontology.org/format_2330" - doc: "BAM statistics report (right after alignment and sorting)" - outputSource: get_bam_statistics/log_file - - insert_size_report: - type: File - label: "Insert size distribution report" - format: "http://edamontology.org/format_3475" - doc: "Insert size distribution report (right after alignment and sorting)" - outputSource: get_bam_statistics/ext_is_section - 'sd:visualPlugins': - - scatter: - tab: 'QC Plots' - Title: 'Insert Size Distribution' - xAxisTitle: 'Insert size' - yAxisTitle: 'Pairs total' - colors: ["#4b78a3"] - height: 500 - data: [$1, $2] - comparable: "isdp" - - -steps: - - extract_fastq_upstream: - run: ../tools/extract-fastq.cwl - in: - output_prefix: - default: "read_1" - compressed_file: fastq_file_upstream - out: [fastq_file] - - extract_fastq_downstream: - run: ../tools/extract-fastq.cwl - in: - output_prefix: - default: "read_2" - compressed_file: fastq_file_downstream - out: [fastq_file] - - star_aligner: - run: 
../tools/star-alignreads.cwl - in: - readFilesIn: [extract_fastq_upstream/fastq_file, extract_fastq_downstream/fastq_file] - genomeDir: star_indices_folder - outFilterMultimapNmax: - default: 1 - outFilterMismatchNmax: - default: 5 - alignSJDBoverhangMin: - default: 1 - seedSearchStartLmax: - default: 15 - clip3pNbases: clip_3p_end - clip5pNbases: clip_5p_end - threads: threads - out: - - aligned_file - - log_final - - uniquely_mapped_reads_number - - log_out - - log_progress - - log_std - - log_sj - - fastx_quality_stats_upstream: - run: ../tools/fastx-quality-stats.cwl - in: - input_file: extract_fastq_upstream/fastq_file - out: [statistics_file] - - fastx_quality_stats_downstream: - run: ../tools/fastx-quality-stats.cwl - in: - input_file: extract_fastq_downstream/fastq_file - out: [statistics_file] - - samtools_sort_index: - run: ../tools/samtools-sort-index.cwl - in: - sort_input: star_aligner/aligned_file - sort_output_filename: - source: extract_fastq_upstream/fastq_file - valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'.bam') - threads: threads - out: [bam_bai_pair] - - bam_to_bigwig_upstream: - run: ../tools/bam-bedgraph-bigwig.cwl - in: - bam_file: samtools_sort_index/bam_bai_pair - chrom_length_file: chrom_length_file - mapped_reads_number: - source: star_aligner/uniquely_mapped_reads_number - valueFrom: $(self*2) - bigwig_filename: - source: samtools_sort_index/bam_bai_pair - valueFrom: | - ${ - let root = self.basename.split('.').slice(0,-1).join('.'); - let ext = "_upstream.bigWig"; - return (root == "")?self.basename+ext:root+ext; - } - strand: - default: '+' - dutp: - default: true - out: [bigwig_file] - - bam_to_bigwig_downstream: - run: ../tools/bam-bedgraph-bigwig.cwl - in: - bam_file: samtools_sort_index/bam_bai_pair - chrom_length_file: chrom_length_file - mapped_reads_number: - source: star_aligner/uniquely_mapped_reads_number - valueFrom: $(-self*2) - bigwig_filename: - source: 
samtools_sort_index/bam_bai_pair - valueFrom: | - ${ - let root = self.basename.split('.').slice(0,-1).join('.'); - let ext = "_downstream.bigWig"; - return (root == "")?self.basename+ext:root+ext; - } - strand: - default: '-' - dutp: - default: true - out: [bigwig_file] - - bowtie_aligner: - run: ../tools/bowtie-alignreads.cwl - in: - upstream_filelist: extract_fastq_upstream/fastq_file - downstream_filelist: extract_fastq_downstream/fastq_file - indices_folder: bowtie_indices_folder - clip_3p_end: clip_3p_end - clip_5p_end: clip_5p_end - v: - default: 3 - m: - default: 1 - sam: - default: true - threads: threads - out: [log_file] - - rpkm_calculation: - run: ../tools/geep.cwl - in: - bam_file: samtools_sort_index/bam_bai_pair - annotation_file: annotation_file - dutp: - default: true - rpkm_threshold: - default: 0.001 - exclude_chr: exclude_chr - threads: threads - out: [isoforms_file] - - group_isoforms: - run: ../tools/group-isoforms.cwl - in: - isoforms_file: rpkm_calculation/isoforms_file - out: - - genes_file - - common_tss_file - - get_annotation_gtf: - run: ../tools/ucsc-genepredtogtf.cwl - in: - annotation_tsv_file: annotation_file - out: - - annotation_gtf_file - - htseq_count_gene_expression: - run: ../tools/htseq-count.cwl - in: - alignment_bam_file: samtools_sort_index/bam_bai_pair - annotation_gtf_file: get_annotation_gtf/annotation_gtf_file - strand_specific: - default: "reverse" - feature_type: - default: "exon" - feature_id: - default: "gene_id" - out: - - feature_counts_report_file - - stdout_log - - stderr_log - - get_bam_statistics: - run: ../tools/samtools-stats.cwl - in: - bambai_pair: samtools_sort_index/bam_bai_pair - output_filename: - source: samtools_sort_index/bam_bai_pair - valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") - out: [log_file, ext_is_section] - - get_stat: - run: ../tools/collect-statistics-rna-seq.cwl - in: - star_alignment_report: star_aligner/log_final - bowtie_alignment_report: bowtie_aligner/log_file 
- bam_statistics_report: get_bam_statistics/log_file - isoforms_file: rpkm_calculation/isoforms_file - paired_end: - default: true - out: [collected_statistics_yaml, collected_statistics_tsv, collected_statistics_md] - - -$namespaces: - s: http://schema.org/ - -$schemas: -- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - -s:name: "Deprecated. RNA-Seq pipeline paired-end strand specific" -label: "Deprecated. RNA-Seq pipeline paired-end strand specific" -s:alternateName: "RNA-Seq basic analysis workflow for strand specific paired-end experiment" - -s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/rnaseq-pe-dutp.cwl -s:codeRepository: https://github.com/datirium/workflows -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Cincinnati Children's Hospital Medical Center" - s:location: - - class: s:PostalAddress - s:addressCountry: "USA" - s:addressLocality: "Cincinnati" - s:addressRegion: "OH" - s:postalCode: "45229" - s:streetAddress: "3333 Burnet Ave" - s:telephone: "+1(513)636-4200" - s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" - s:department: - - class: s:Organization - s:legalName: "Allergy and Immunology" - s:department: - - class: s:Organization - s:legalName: "Barski Research Lab" - s:member: - - class: s:Person - s:name: Michael Kotliar - s:email: mailto:misha.kotliar@gmail.com - s:sameAs: - - id: http://orcid.org/0000-0002-6486-3898 - - class: s:Person - s:name: Andrey Kartashov - s:email: mailto:Andrey.Kartashov@cchmc.org - s:sameAs: - - id: http://orcid.org/0000-0001-9102-5681 - - -# doc: -# $include: ../descriptions/rnaseq-pe-dutp.md - - -doc: | - The original [BioWardrobe's](https://biowardrobe.com) [PubMed 
ID:26248465](https://www.ncbi.nlm.nih.gov/pubmed/26248465) - **RNA-Seq** basic analysis for a **paired-end** experiment. - A corresponded input [FASTQ](http://maq.sourceforge.net/fastq.shtml) file has to be provided. - - Current workflow should be used only with the paired-end RNA-Seq data. It performs the following steps: - 1. Use STAR to align reads from input FASTQ files according to the predefined reference indices; generate unsorted BAM file and alignment statistics file - 2. Use fastx_quality_stats to analyze input FASTQ files and generate quality statistics files - 3. Use samtools sort to generate coordinate sorted BAM(+BAI) file pair from the unsorted BAM file obtained on the step 1 (after running STAR) - 4. Generate BigWig file on the base of sorted BAM file - 5. Map input FASTQ files to predefined rRNA reference indices using Bowtie to define the level of rRNA contamination; export resulted statistics to file - 6. Calculate isoform expression level for the sorted BAM file and GTF/TAB annotation file using GEEP reads-counting utility; export results to file \ No newline at end of file diff --git a/workflows/rnaseq-pe.cwl b/workflows/rnaseq-pe.cwl deleted file mode 100644 index 3e780e30..00000000 --- a/workflows/rnaseq-pe.cwl +++ /dev/null @@ -1,544 +0,0 @@ -cwlVersion: v1.0 -class: Workflow - - -requirements: - - class: SubworkflowFeatureRequirement - - class: StepInputExpressionRequirement - - class: MultipleInputFeatureRequirement - - class: InlineJavascriptRequirement - expressionLib: - - var get_root = function(basename) { - return basename.split('.').slice(0,1).join('.'); - }; - - -'sd:metadata': - - "../metadata/rnaseq-header.cwl" - -'sd:upstream': - genome_indices: "genome-indices.cwl" - - -inputs: - -# General inputs - - star_indices_folder: - type: Directory - label: "STAR indices folder" - 'sd:upstreamSource': "genome_indices/star_indices" - doc: "Path to STAR generated indices" - - bowtie_indices_folder: - type: Directory - label: "BowTie 
Ribosomal Indices" - 'sd:upstreamSource': "genome_indices/ribosomal_indices" - doc: "Path to Bowtie generated indices" - - chrom_length_file: - type: File - label: "Chromosome length file" - format: "http://edamontology.org/format_2330" - 'sd:upstreamSource': "genome_indices/chrom_length" - doc: "Chromosome length file" - - annotation_file: - type: File - label: "Annotation file" - format: - - "http://edamontology.org/format_2306" - - "http://edamontology.org/format_3475" - 'sd:upstreamSource': "genome_indices/annotation" - doc: "GTF or TAB-separated annotation file" - - fastq_file_upstream: - type: File - label: "FASTQ 1 input file" - format: "http://edamontology.org/format_1930" - doc: "Reads data in a FASTQ format, received after paired end sequencing" - - fastq_file_downstream: - type: File - label: "FASTQ 2 input file" - format: "http://edamontology.org/format_1930" - doc: "Reads data in a FASTQ format, received after paired end sequencing" - -# Advanced inputs - - exclude_chr: - type: string? - 'sd:layout': - advanced: true - label: "Chromosome to be excluded in rpkm calculation" - doc: "Chromosome to be excluded in rpkm calculation" - - clip_3p_end: - type: int? - default: 0 - 'sd:layout': - advanced: true - label: "Clip from 3p end" - doc: "Number of bases to clip from the 3p end" - - clip_5p_end: - type: int? - default: 0 - 'sd:layout': - advanced: true - label: "Clip from 5p end" - doc: "Number of bases to clip from the 5p end" - -# System dependent - - threads: - type: int? 
- default: 2 - 'sd:layout': - advanced: true - label: "Number of threads" - doc: "Number of threads for those steps that support multithreading" - - -outputs: - - bigwig: - type: File - format: "http://edamontology.org/format_3006" - label: "BigWig file" - doc: "Generated BigWig file" - outputSource: bam_to_bigwig/bigwig_file - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'wig' - name: "BigWig Track" - height: 120 - - star_final_log: - type: File - format: "http://edamontology.org/format_2330" - label: "STAR final log" - doc: "STAR Log.final.out" - outputSource: star_aligner/log_final - - star_out_log: - type: File? - format: "http://edamontology.org/format_2330" - label: "STAR log out" - doc: "STAR Log.out" - outputSource: star_aligner/log_out - - star_progress_log: - type: File? - format: "http://edamontology.org/format_2330" - label: "STAR progress log" - doc: "STAR Log.progress.out" - outputSource: star_aligner/log_progress - - star_stdout_log: - type: File? - format: "http://edamontology.org/format_2330" - label: "STAR stdout log" - doc: "STAR Log.std.out" - outputSource: star_aligner/log_std - - star_sj_log: - type: File? 
- format: "http://edamontology.org/format_2330" - label: "STAR sj log" - doc: "STAR SJ.out.tab" - outputSource: star_aligner/log_sj - - fastx_statistics_upstream: - type: File - format: "http://edamontology.org/format_2330" - label: "FASTQ 1 statistics" - doc: "fastx_quality_stats generated FASTQ 1 quality statistics file" - outputSource: fastx_quality_stats_upstream/statistics_file - 'sd:visualPlugins': - - line: - tab: 'QC Plots' - Title: 'FASTQ 1 Base frequency plot' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Frequency' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$13, $14, $15, $16, $17] - - boxplot: - tab: 'QC Plots' - Title: 'FASTQ 1 Quality Control' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Quality score' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$11, $7, $8, $9, $12] - - fastx_statistics_downstream: - type: File - format: "http://edamontology.org/format_2330" - label: "FASTQ 2 statistics" - doc: "fastx_quality_stats generated FASTQ 2 quality statistics file" - outputSource: fastx_quality_stats_downstream/statistics_file - 'sd:visualPlugins': - - line: - tab: 'QC Plots' - Title: 'FASTQ 2 Base frequency plot' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Frequency' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$13, $14, $15, $16, $17] - - boxplot: - tab: 'QC Plots' - Title: 'FASTQ 2 Quality Control' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Quality score' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$11, $7, $8, $9, $12] - - bambai_pair: - type: File - format: "http://edamontology.org/format_2572" - label: "Coordinate sorted BAM alignment file (+index BAI)" - doc: "Coordinate sorted BAM file and BAI index file" - outputSource: samtools_sort_index/bam_bai_pair - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - optional: true - type: 'alignment' - format: 'bam' - name: "BAM 
Track" - displayMode: "SQUISHED" - - bowtie_log: - type: File - format: "http://edamontology.org/format_2330" - label: "Bowtie alignment log" - doc: "Bowtie alignment log file" - outputSource: bowtie_aligner/log_file - - rpkm_isoforms: - type: File - format: "http://edamontology.org/format_3752" - label: "RPKM, grouped by isoforms" - doc: "Calculated rpkm values, grouped by isoforms" - outputSource: rpkm_calculation/isoforms_file - - rpkm_genes: - type: File - format: "http://edamontology.org/format_3475" - label: "RPKM, grouped by gene name" - doc: "Calculated rpkm values, grouped by gene name" - outputSource: group_isoforms/genes_file - 'sd:visualPlugins': - - syncfusiongrid: - tab: 'Gene Expression' - Title: 'RPKM, grouped by gene name' - - rpkm_common_tss: - type: File - format: "http://edamontology.org/format_3475" - label: "RPKM, grouped by common TSS" - doc: "Calculated rpkm values, grouped by common TSS" - outputSource: group_isoforms/common_tss_file - - htseq_count_gene_expression_file: - type: File - format: "http://edamontology.org/format_3475" - label: "HTSeq: read counts grouped by gene_id" - doc: "HTSeq: read counts grouped by gene_id" - outputSource: htseq_count_gene_expression/feature_counts_report_file - - htseq_count_stdout_log: - type: File - format: "http://edamontology.org/format_2330" - label: "HTSeq: stdout log" - doc: "HTSeq: stdout log" - outputSource: htseq_count_gene_expression/stdout_log - - htseq_count_stderr_log: - type: File - format: "http://edamontology.org/format_2330" - label: "HTSeq: stderr log" - doc: "HTSeq: stderr log" - outputSource: htseq_count_gene_expression/stderr_log - - get_stat_log: - type: File? - label: "YAML formatted combined log" - format: "http://edamontology.org/format_3750" - doc: "YAML formatted combined log" - outputSource: get_stat/collected_statistics_yaml - - get_stat_markdown: - type: File? 
- label: "Markdown formatted combined log" - format: "http://edamontology.org/format_3835" - doc: "Markdown formatted combined log" - outputSource: get_stat/collected_statistics_md - 'sd:visualPlugins': - - markdownView: - tab: 'Overview' - - get_formatted_stats: - type: File? - label: "Bowtie, STAR and GEEP mapping stats" - format: "http://edamontology.org/format_2330" - doc: "Processed and combined Bowtie & STAR aligner and GEEP logs" - outputSource: get_stat/collected_statistics_tsv - 'sd:visualPlugins': - - tableView: - vertical: true - tab: 'Overview' - 'sd:preview': - 'sd:visualPlugins': - - pie: - colors: ['#b3de69', '#99c0db', '#fdc381', '#fb8072'] - data: [$2, $3, $4, $5] - - bam_statistics_report: - type: File - label: "BAM statistics report" - format: "http://edamontology.org/format_2330" - doc: "BAM statistics report (right after alignment and sorting)" - outputSource: get_bam_statistics/log_file - - insert_size_report: - type: File - label: "Insert size distribution report" - format: "http://edamontology.org/format_3475" - doc: "Insert size distribution report (right after alignment and sorting)" - outputSource: get_bam_statistics/ext_is_section - 'sd:visualPlugins': - - scatter: - tab: 'QC Plots' - Title: 'Insert Size Distribution' - xAxisTitle: 'Insert size' - yAxisTitle: 'Pairs total' - colors: ["#4b78a3"] - height: 500 - data: [$1, $2] - comparable: "isdp" - -steps: - - extract_fastq_upstream: - run: ../tools/extract-fastq.cwl - in: - output_prefix: - default: "read_1" - compressed_file: fastq_file_upstream - out: [fastq_file] - - extract_fastq_downstream: - run: ../tools/extract-fastq.cwl - in: - output_prefix: - default: "read_2" - compressed_file: fastq_file_downstream - out: [fastq_file] - - star_aligner: - run: ../tools/star-alignreads.cwl - in: - readFilesIn: [extract_fastq_upstream/fastq_file, extract_fastq_downstream/fastq_file] - genomeDir: star_indices_folder - outFilterMultimapNmax: - default: 1 - outFilterMismatchNmax: - default: 5 - 
alignSJDBoverhangMin: - default: 1 - seedSearchStartLmax: - default: 15 - clip3pNbases: clip_3p_end - clip5pNbases: clip_5p_end - threads: threads - out: - - aligned_file - - log_final - - uniquely_mapped_reads_number - - log_out - - log_progress - - log_std - - log_sj - - fastx_quality_stats_upstream: - run: ../tools/fastx-quality-stats.cwl - in: - input_file: extract_fastq_upstream/fastq_file - out: [statistics_file] - - fastx_quality_stats_downstream: - run: ../tools/fastx-quality-stats.cwl - in: - input_file: extract_fastq_downstream/fastq_file - out: [statistics_file] - - samtools_sort_index: - run: ../tools/samtools-sort-index.cwl - in: - sort_input: star_aligner/aligned_file - sort_output_filename: - source: extract_fastq_upstream/fastq_file - valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'.bam') - threads: threads - out: [bam_bai_pair] - - bam_to_bigwig: - run: ../tools/bam-bedgraph-bigwig.cwl - in: - bam_file: samtools_sort_index/bam_bai_pair - chrom_length_file: chrom_length_file - mapped_reads_number: - source: star_aligner/uniquely_mapped_reads_number - valueFrom: $(self*2) - out: [bigwig_file] - - bowtie_aligner: - run: ../tools/bowtie-alignreads.cwl - in: - upstream_filelist: extract_fastq_upstream/fastq_file - downstream_filelist: extract_fastq_downstream/fastq_file - indices_folder: bowtie_indices_folder - clip_3p_end: clip_3p_end - clip_5p_end: clip_5p_end - v: - default: 3 - m: - default: 1 - sam: - default: true - threads: threads - out: [log_file] - - rpkm_calculation: - run: ../tools/geep.cwl - in: - bam_file: samtools_sort_index/bam_bai_pair - annotation_file: annotation_file - rpkm_threshold: - default: 0.001 - exclude_chr: exclude_chr - threads: threads - out: [isoforms_file] - - group_isoforms: - run: ../tools/group-isoforms.cwl - in: - isoforms_file: rpkm_calculation/isoforms_file - out: - - genes_file - - common_tss_file - - get_annotation_gtf: - run: ../tools/ucsc-genepredtogtf.cwl - in: - 
annotation_tsv_file: annotation_file - out: - - annotation_gtf_file - - htseq_count_gene_expression: - run: ../tools/htseq-count.cwl - in: - alignment_bam_file: samtools_sort_index/bam_bai_pair - annotation_gtf_file: get_annotation_gtf/annotation_gtf_file - strand_specific: - default: "no" - feature_type: - default: "exon" - feature_id: - default: "gene_id" - out: - - feature_counts_report_file - - stdout_log - - stderr_log - - get_bam_statistics: - run: ../tools/samtools-stats.cwl - in: - bambai_pair: samtools_sort_index/bam_bai_pair - output_filename: - source: samtools_sort_index/bam_bai_pair - valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") - out: [log_file, ext_is_section] - - get_stat: - run: ../tools/collect-statistics-rna-seq.cwl - in: - star_alignment_report: star_aligner/log_final - bowtie_alignment_report: bowtie_aligner/log_file - bam_statistics_report: get_bam_statistics/log_file - isoforms_file: rpkm_calculation/isoforms_file - paired_end: - default: true - out: [collected_statistics_yaml, collected_statistics_tsv, collected_statistics_md] - - -$namespaces: - s: http://schema.org/ - -$schemas: -- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - -s:name: "Deprecated. RNA-Seq pipeline paired-end" -label: "Deprecated. 
RNA-Seq pipeline paired-end" -s:alternateName: "RNA-Seq basic analysis workflow for paired-end experiment" - -s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/rnaseq-pe.cwl -s:codeRepository: https://github.com/datirium/workflows -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Cincinnati Children's Hospital Medical Center" - s:location: - - class: s:PostalAddress - s:addressCountry: "USA" - s:addressLocality: "Cincinnati" - s:addressRegion: "OH" - s:postalCode: "45229" - s:streetAddress: "3333 Burnet Ave" - s:telephone: "+1(513)636-4200" - s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" - s:department: - - class: s:Organization - s:legalName: "Allergy and Immunology" - s:department: - - class: s:Organization - s:legalName: "Barski Research Lab" - s:member: - - class: s:Person - s:name: Michael Kotliar - s:email: mailto:misha.kotliar@gmail.com - s:sameAs: - - id: http://orcid.org/0000-0002-6486-3898 - - class: s:Person - s:name: Andrey Kartashov - s:email: mailto:Andrey.Kartashov@cchmc.org - s:sameAs: - - id: http://orcid.org/0000-0001-9102-5681 - - -# doc: -# $include: ../descriptions/rnaseq-pe.md - - -doc: | - The original [BioWardrobe's](https://biowardrobe.com) [PubMed ID:26248465](https://www.ncbi.nlm.nih.gov/pubmed/26248465) - **RNA-Seq** basic analysis for a **paired-end** experiment. - A corresponded input [FASTQ](http://maq.sourceforge.net/fastq.shtml) file has to be provided. - - Current workflow should be used only with the paired-end RNA-Seq data. It performs the following steps: - 1. Use STAR to align reads from input FASTQ files according to the predefined reference indices; generate unsorted BAM file and alignment statistics file - 2. 
Use fastx_quality_stats to analyze input FASTQ files and generate quality statistics files - 3. Use samtools sort to generate coordinate sorted BAM(+BAI) file pair from the unsorted BAM file obtained on the step 1 (after running STAR) - 4. Generate BigWig file on the base of sorted BAM file - 5. Map input FASTQ files to predefined rRNA reference indices using Bowtie to define the level of rRNA contamination; export resulted statistics to file - 6. Calculate isoform expression level for the sorted BAM file and GTF/TAB annotation file using GEEP reads-counting utility; export results to file \ No newline at end of file diff --git a/workflows/rnaseq-se-dutp-mitochondrial.cwl b/workflows/rnaseq-se-dutp-mitochondrial.cwl deleted file mode 100644 index a3302916..00000000 --- a/workflows/rnaseq-se-dutp-mitochondrial.cwl +++ /dev/null @@ -1,574 +0,0 @@ -cwlVersion: v1.0 -class: Workflow - -requirements: - - class: SubworkflowFeatureRequirement - - class: StepInputExpressionRequirement - - class: MultipleInputFeatureRequirement - - class: InlineJavascriptRequirement - expressionLib: - - var get_root = function(basename) { - return basename.split('.').slice(0,1).join('.'); - }; - -'sd:metadata': - - "../metadata/rnaseq-header.cwl" - - -'sd:upstream': - genome_indices: "genome-indices.cwl" - - -inputs: - -# General inputs - - star_indices_folder: - type: Directory - label: "STAR indices folder" - 'sd:upstreamSource': "genome_indices/star_indices" - doc: "Path to STAR generated indices" - - star_indices_folder_mitochondrial: - type: Directory - label: "STAR indices mitochondrial folder" - 'sd:upstreamSource': "genome_indices/mitochondrial_indices" - doc: "Path to STAR generated indices for mitochondrial dna" - - bowtie_indices_folder: - type: Directory - label: "BowTie Ribosomal Indices" - 'sd:upstreamSource': "genome_indices/ribosomal_indices" - doc: "Path to Bowtie generated indices" - - chrom_length_file: - type: File - label: "Chromosome length file" - format: 
"http://edamontology.org/format_2330" - 'sd:upstreamSource': "genome_indices/chrom_length" - doc: "Chromosome length file" - - annotation_file: - type: File - label: "Annotation file" - format: - - "http://edamontology.org/format_2306" - - "http://edamontology.org/format_3475" - 'sd:upstreamSource': "genome_indices/annotation" - doc: "GTF or TAB-separated annotation file" - - fastq_file: - type: File - label: "FASTQ input file" - format: "http://edamontology.org/format_1930" - doc: "Reads data in a FASTQ format" - -# Advanced inputs - - exclude_chr: - type: string? - 'sd:layout': - advanced: true - label: "Chromosome to be excluded in rpkm calculation" - doc: "Chromosome to be excluded in rpkm calculation" - - clip_3p_end: - type: int? - default: 0 - 'sd:layout': - advanced: true - label: "Clip from 3p end" - doc: "Number of bases to clip from the 3p end" - - clip_5p_end: - type: int? - default: 0 - 'sd:layout': - advanced: true - label: "Clip from 5p end" - doc: "Number of bases to clip from the 5p end" - -# System dependent - - threads: - type: int? 
- default: 2 - 'sd:layout': - advanced: true - label: "Number of threads" - doc: "Number of threads for those steps that support multithreading" - -outputs: - - bigwig_upstream: - type: File - format: "http://edamontology.org/format_3006" - label: "BigWig file" - doc: "Generated BigWig file for (+)strand reads" - outputSource: bam_to_bigwig_upstream/bigwig_file - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'wig' - name: "(+)strand BigWig" - height: 120 - - bigwig_downstream: - type: File - format: "http://edamontology.org/format_3006" - label: "BigWig file" - doc: "Generated BigWig file for (-)strand reads" - outputSource: bam_to_bigwig_downstream/bigwig_file - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'wig' - name: "(-)strand BigWig" - height: 120 - - star_final_log: - type: File - format: "http://edamontology.org/format_2330" - label: "STAR final log" - doc: "STAR Log.final.out" - outputSource: star_aligner/log_final - - star_out_log: - type: File? - format: "http://edamontology.org/format_2330" - label: "STAR log out" - doc: "STAR Log.out" - outputSource: star_aligner/log_out - - star_progress_log: - type: File? - format: "http://edamontology.org/format_2330" - label: "STAR progress log" - doc: "STAR Log.progress.out" - outputSource: star_aligner/log_progress - - star_stdout_log: - type: File? - format: "http://edamontology.org/format_2330" - label: "STAR stdout log" - doc: "STAR Log.std.out" - outputSource: star_aligner/log_std - - star_sj_log: - type: File? 
- format: "http://edamontology.org/format_2330" - label: "STAR sj log" - doc: "STAR SJ.out.tab" - outputSource: star_aligner/log_sj - - fastx_statistics: - type: File - format: "http://edamontology.org/format_2330" - label: "FASTQ statistics" - doc: "fastx_quality_stats generated FASTQ file quality statistics file" - outputSource: fastx_quality_stats/statistics_file - 'sd:visualPlugins': - - line: - tab: 'QC Plots' - Title: 'Base frequency plot' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Frequency' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$13, $14, $15, $16, $17] - - boxplot: - tab: 'QC Plots' - Title: 'Quality Control' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Quality score' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$11, $7, $8, $9, $12] - - bam_merged_index: - type: File - format: "http://edamontology.org/format_2572" - label: "Coordinate sorted BAM alignment file (+index BAI)" - doc: "Coordinate sorted BAM file and BAI index file" - outputSource: merge_original_and_mitochondrial_index/bam_bai_pair - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - optional: true - type: 'alignment' - format: 'bam' - name: "BAM Track" - displayMode: "SQUISHED" - - bowtie_log: - type: File - format: "http://edamontology.org/format_2330" - label: "Bowtie alignment log" - doc: "Bowtie alignment log file" - outputSource: bowtie_aligner/log_file - - rpkm_isoforms: - type: File - format: "http://edamontology.org/format_3752" - label: "RPKM, grouped by isoforms" - doc: "Calculated rpkm values, grouped by isoforms" - outputSource: rpkm_calculation/isoforms_file - - rpkm_genes: - type: File - format: "http://edamontology.org/format_3475" - label: "RPKM, grouped by gene name" - doc: "Calculated rpkm values, grouped by gene name" - outputSource: group_isoforms/genes_file - 'sd:visualPlugins': - - syncfusiongrid: - tab: 'Gene Expression' - Title: 'RPKM, grouped by gene 
name' - - rpkm_common_tss: - type: File - format: "http://edamontology.org/format_3475" - label: "RPKM, grouped by common TSS" - doc: "Calculated rpkm values, grouped by common TSS" - outputSource: group_isoforms/common_tss_file - - htseq_count_gene_expression_file: - type: File - format: "http://edamontology.org/format_3475" - label: "HTSeq: read counts grouped by gene_id" - doc: "HTSeq: read counts grouped by gene_id" - outputSource: htseq_count_gene_expression/feature_counts_report_file - - htseq_count_stdout_log: - type: File - format: "http://edamontology.org/format_2330" - label: "HTSeq: stdout log" - doc: "HTSeq: stdout log" - outputSource: htseq_count_gene_expression/stdout_log - - htseq_count_stderr_log: - type: File - format: "http://edamontology.org/format_2330" - label: "HTSeq: stderr log" - doc: "HTSeq: stderr log" - outputSource: htseq_count_gene_expression/stderr_log - - get_stat_log: - type: File? - label: "YAML formatted combined log" - format: "http://edamontology.org/format_3750" - doc: "YAML formatted combined log" - outputSource: get_stat/collected_statistics_yaml - - get_stat_markdown: - type: File? - label: "Markdown formatted combined log" - format: "http://edamontology.org/format_3835" - doc: "Markdown formatted combined log" - outputSource: get_stat/collected_statistics_md - 'sd:visualPlugins': - - markdownView: - tab: 'Overview' - - get_formatted_stats: - type: File? 
- label: "Bowtie, STAR and GEEP mapping stats" - format: "http://edamontology.org/format_2330" - doc: "Processed and combined Bowtie & STAR aligner and GEEP logs" - outputSource: get_stat/collected_statistics_tsv - 'sd:visualPlugins': - - tableView: - vertical: true - tab: 'Overview' - 'sd:preview': - 'sd:visualPlugins': - - pie: - colors: ['#b3de69', '#99c0db', '#fdc381', '#fb8072'] - data: [$2, $3, $4, $5] - - bam_statistics_report: - type: File - label: "BAM statistics report" - format: "http://edamontology.org/format_2330" - doc: "BAM statistics report (right after alignment and sorting)" - outputSource: get_bam_statistics/log_file - - -steps: - - extract_fastq: - run: ../tools/extract-fastq.cwl - in: - compressed_file: fastq_file - out: [fastq_file] - - star_aligner: - run: ../tools/star-alignreads.cwl - in: - readFilesIn: extract_fastq/fastq_file - genomeDir: star_indices_folder - outFilterMultimapNmax: - default: 1 - outFilterMismatchNmax: - default: 5 - alignSJDBoverhangMin: - default: 1 - seedSearchStartLmax: - default: 15 - outReadsUnmapped: - default: "Fastx" - clip3pNbases: clip_3p_end - clip5pNbases: clip_5p_end - threads: threads - out: - - aligned_file - - unmapped_mate_1_file - - log_final - - uniquely_mapped_reads_number - - log_out - - log_progress - - log_std - - log_sj - - star_aligner_mitochondrial: - run: ../tools/star-alignreads.cwl - in: - readFilesIn: star_aligner/unmapped_mate_1_file - genomeDir: star_indices_folder_mitochondrial - outFilterMultimapNmax: - default: 1 - outFilterMismatchNmax: - default: 5 - alignSJDBoverhangMin: - default: 1 - seedSearchStartLmax: - default: 15 - clip3pNbases: clip_3p_end - clip5pNbases: clip_5p_end - threads: threads - out: - - aligned_file - - log_final - - uniquely_mapped_reads_number - - log_out - - log_progress - - log_std - - log_sj - - fastx_quality_stats: - run: ../tools/fastx-quality-stats.cwl - in: - input_file: extract_fastq/fastq_file - out: [statistics_file] - - 
samtools_sort_index_mitochondrial: - run: ../tools/samtools-sort-index.cwl - in: - sort_input: star_aligner_mitochondrial/aligned_file - sort_output_filename: - source: extract_fastq/fastq_file - valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'_mitochondrial.bam') - threads: threads - out: [bam_bai_pair] - - samtools_sort_index: - run: ../tools/samtools-sort-index.cwl - in: - sort_input: star_aligner/aligned_file - sort_output_filename: - source: extract_fastq/fastq_file - valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'_sorted.bam') - threads: threads - out: [bam_bai_pair] - - merge_original_and_mitochondrial: - run: ../tools/samtools-merge.cwl - in: - output_filename: - source: extract_fastq/fastq_file - valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'_merged.bam') - alignment_files: [ samtools_sort_index/bam_bai_pair, samtools_sort_index_mitochondrial/bam_bai_pair ] - out: [merged_alignment_file] - - merge_original_and_mitochondrial_index: - run: ../tools/samtools-sort-index.cwl - in: - sort_input: merge_original_and_mitochondrial/merged_alignment_file - sort_output_filename: - source: extract_fastq/fastq_file - valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'.bam') - threads: threads - out: [bam_bai_pair] - - bam_to_bigwig_upstream: - run: ../tools/bam-bedgraph-bigwig.cwl - in: - bam_file: merge_original_and_mitochondrial_index/bam_bai_pair - chrom_length_file: chrom_length_file - mapped_reads_number: star_aligner/uniquely_mapped_reads_number - bigwig_filename: - source: extract_fastq/fastq_file - valueFrom: | - ${ - var root = self.basename.split('.').slice(0,-1).join('.'); - var ext = "_upstream.bigWig"; - return (root == "")?self.basename+ext:root+ext; - } - strand: - default: '+' - out: [bigwig_file] - - bam_to_bigwig_downstream: - run: ../tools/bam-bedgraph-bigwig.cwl - in: - bam_file: 
merge_original_and_mitochondrial_index/bam_bai_pair - chrom_length_file: chrom_length_file - mapped_reads_number: - source: star_aligner/uniquely_mapped_reads_number - valueFrom: $(-self) - bigwig_filename: - source: extract_fastq/fastq_file - valueFrom: | - ${ - var root = self.basename.split('.').slice(0,-1).join('.'); - var ext = "_downstream.bigWig"; - return (root == "")?self.basename+ext:root+ext; - } - strand: - default: '-' - out: [bigwig_file] - - bowtie_aligner: - run: ../tools/bowtie-alignreads.cwl - in: - upstream_filelist: extract_fastq/fastq_file - indices_folder: bowtie_indices_folder - clip_3p_end: clip_3p_end - clip_5p_end: clip_5p_end - v: - default: 3 - m: - default: 1 - best: - default: true - strata: - default: true - sam: - default: true - threads: threads - out: [log_file] - - rpkm_calculation: - run: ../tools/geep.cwl - in: - bam_file: merge_original_and_mitochondrial_index/bam_bai_pair - annotation_file: annotation_file - dutp: - default: true - rpkm_threshold: - default: 0.001 - exclude_chr: exclude_chr - threads: threads - out: [isoforms_file] - - group_isoforms: - run: ../tools/group-isoforms.cwl - in: - isoforms_file: rpkm_calculation/isoforms_file - out: - - genes_file - - common_tss_file - - get_annotation_gtf: - run: ../tools/ucsc-genepredtogtf.cwl - in: - annotation_tsv_file: annotation_file - out: - - annotation_gtf_file - - htseq_count_gene_expression: - run: ../tools/htseq-count.cwl - in: - alignment_bam_file: merge_original_and_mitochondrial_index/bam_bai_pair - annotation_gtf_file: get_annotation_gtf/annotation_gtf_file - strand_specific: - default: "reverse" - feature_type: - default: "exon" - feature_id: - default: "gene_id" - out: - - feature_counts_report_file - - stdout_log - - stderr_log - - get_bam_statistics: - run: ../tools/samtools-stats.cwl - in: - bambai_pair: samtools_sort_index/bam_bai_pair - output_filename: - source: samtools_sort_index/bam_bai_pair - valueFrom: 
$(get_root(self.basename)+"_bam_statistics_report.txt") - out: [log_file] - - get_stat: - run: ../tools/collect-statistics-rna-seq.cwl - in: - star_alignment_report: star_aligner/log_final - bowtie_alignment_report: bowtie_aligner/log_file - bam_statistics_report: get_bam_statistics/log_file - isoforms_file: rpkm_calculation/isoforms_file - out: [collected_statistics_yaml, collected_statistics_tsv, collected_statistics_md] - -$namespaces: - s: http://schema.org/ - -$schemas: -- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - -s:name: "RNA-Seq pipeline single-read stranded mitochondrial" -label: "RNA-Seq pipeline single-read stranded mitochondrial" -s:alternateName: "RNA-Seq strand specific mitochondrial workflow for single-read experiment based on BioWardrobe's basic analysis" - -s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/rnaseq-se-dutp-mitochondrial.cwl -s:codeRepository: https://github.com/datirium/workflows -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Datirium, LLC" - s:member: - - class: s:Person - s:name: Artem BArski - s:email: mailto:Artem.Barski@datirum.com - - class: s:Person - s:name: Andrey Kartashov - s:email: mailto:Andrey.Kartashov@datirium.com - s:sameAs: - - id: http://orcid.org/0000-0001-9102-5681 - - -# doc: -# $include: ../descriptions/rnaseq-se-dutp-mitochondrial.md - - -doc: | - Slightly changed original [BioWardrobe's](https://biowardrobe.com) [PubMed ID:26248465](https://www.ncbi.nlm.nih.gov/pubmed/26248465) - **RNA-Seq** basic analysis for **strand specific single-read** experiment. - An additional steps were added to map data to mitochondrial chromosome only and then merge the output. 
- - Experiment files in [FASTQ](http://maq.sourceforge.net/fastq.shtml) format either compressed or not can be used. - - Current workflow should be used only with single-read strand specific RNA-Seq data. It performs the following steps: - 1. `STAR` to align reads from input FASTQ file according to the predefined reference indices; generate unsorted BAM file and alignment statistics file - 2. `fastx_quality_stats` to analyze input FASTQ file and generate quality statistics file - 3. `samtools sort` to generate coordinate sorted BAM(+BAI) file pair from the unsorted BAM file obtained on the step 1 (after running STAR) - 5. Generate BigWig file on the base of sorted BAM file - 6. Map input FASTQ file to predefined rRNA reference indices using Bowtie to define the level of rRNA contamination; export resulted statistics to file - 7. Calculate isoform expression level for the sorted BAM file and GTF/TAB annotation file using `GEEP` reads-counting utility; export results to file diff --git a/workflows/rnaseq-se-dutp.cwl b/workflows/rnaseq-se-dutp.cwl deleted file mode 100644 index 74321f63..00000000 --- a/workflows/rnaseq-se-dutp.cwl +++ /dev/null @@ -1,527 +0,0 @@ -cwlVersion: v1.0 -class: Workflow - - -requirements: - - class: SubworkflowFeatureRequirement - - class: StepInputExpressionRequirement - - class: InlineJavascriptRequirement - expressionLib: - - var get_root = function(basename) { - return basename.split('.').slice(0,1).join('.'); - }; - - -'sd:metadata': - - "../metadata/rnaseq-header.cwl" - -'sd:upstream': - genome_indices: "genome-indices.cwl" - - -inputs: - -# General inputs - - star_indices_folder: - type: Directory - label: "STAR indices folder" - 'sd:upstreamSource': "genome_indices/star_indices" - doc: "Path to STAR generated indices" - - bowtie_indices_folder: - type: Directory - label: "BowTie Ribosomal Indices" - 'sd:upstreamSource': "genome_indices/ribosomal_indices" - doc: "Path to Bowtie generated indices" - - chrom_length_file: - type: File - 
label: "Chromosome length file" - format: "http://edamontology.org/format_2330" - 'sd:upstreamSource': "genome_indices/chrom_length" - doc: "Chromosome length file" - - annotation_file: - type: File - label: "Annotation file" - format: - - "http://edamontology.org/format_2306" - - "http://edamontology.org/format_3475" - 'sd:upstreamSource': "genome_indices/annotation" - doc: "GTF or TAB-separated annotation file" - - fastq_file: - type: File - label: "FASTQ input file" - format: "http://edamontology.org/format_1930" - doc: "Reads data in a FASTQ format" - -# Advanced inputs - - exclude_chr: - type: string? - 'sd:layout': - advanced: true - label: "Chromosome to be excluded in rpkm calculation" - doc: "Chromosome to be excluded in rpkm calculation" - - clip_3p_end: - type: int? - default: 0 - 'sd:layout': - advanced: true - label: "Clip from 3p end" - doc: "Number of bases to clip from the 3p end" - - clip_5p_end: - type: int? - default: 0 - 'sd:layout': - advanced: true - label: "Clip from 5p end" - doc: "Number of bases to clip from the 5p end" - -# System dependent - - threads: - type: int? 
- default: 2 - 'sd:layout': - advanced: true - label: "Number of threads" - doc: "Number of threads for those steps that support multithreading" - -outputs: - - bigwig_upstream: - type: File - format: "http://edamontology.org/format_3006" - label: "BigWig file" - doc: "Generated BigWig file for (+)strand reads" - outputSource: bam_to_bigwig_upstream/bigwig_file - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'wig' - name: "(+)strand BigWig" - height: 120 - - bigwig_downstream: - type: File - format: "http://edamontology.org/format_3006" - label: "BigWig file" - doc: "Generated BigWig file for (-)strand reads" - outputSource: bam_to_bigwig_downstream/bigwig_file - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'wig' - name: "(-)strand BigWig" - height: 120 - - star_final_log: - type: File - format: "http://edamontology.org/format_2330" - label: "STAR final log" - doc: "STAR Log.final.out" - outputSource: star_aligner/log_final - - star_out_log: - type: File? - format: "http://edamontology.org/format_2330" - label: "STAR log out" - doc: "STAR Log.out" - outputSource: star_aligner/log_out - - star_progress_log: - type: File? - format: "http://edamontology.org/format_2330" - label: "STAR progress log" - doc: "STAR Log.progress.out" - outputSource: star_aligner/log_progress - - star_stdout_log: - type: File? - format: "http://edamontology.org/format_2330" - label: "STAR stdout log" - doc: "STAR Log.std.out" - outputSource: star_aligner/log_std - - star_sj_log: - type: File? 
- format: "http://edamontology.org/format_2330" - label: "STAR sj log" - doc: "STAR SJ.out.tab" - outputSource: star_aligner/log_sj - - fastx_statistics: - type: File - format: "http://edamontology.org/format_2330" - label: "FASTQ statistics" - doc: "fastx_quality_stats generated FASTQ file quality statistics file" - outputSource: fastx_quality_stats/statistics_file - 'sd:visualPlugins': - - line: - tab: 'QC Plots' - Title: 'Base frequency plot' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Frequency' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$13, $14, $15, $16, $17] - - boxplot: - tab: 'QC Plots' - Title: 'Quality Control' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Quality score' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$11, $7, $8, $9, $12] - - bambai_pair: - type: File - format: "http://edamontology.org/format_2572" - label: "Coordinate sorted BAM alignment file (+index BAI)" - doc: "Coordinate sorted BAM file and BAI index file" - outputSource: samtools_sort_index/bam_bai_pair - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - optional: true - type: 'alignment' - format: 'bam' - name: "BAM Track" - displayMode: "SQUISHED" - - bowtie_log: - type: File - format: "http://edamontology.org/format_2330" - label: "Bowtie alignment log" - doc: "Bowtie alignment log file" - outputSource: bowtie_aligner/log_file - - rpkm_isoforms: - type: File - format: "http://edamontology.org/format_3752" - label: "RPKM, grouped by isoforms" - doc: "Calculated rpkm values, grouped by isoforms" - outputSource: rpkm_calculation/isoforms_file - - rpkm_genes: - type: File - format: "http://edamontology.org/format_3475" - label: "RPKM, grouped by gene name" - doc: "Calculated rpkm values, grouped by gene name" - outputSource: group_isoforms/genes_file - 'sd:visualPlugins': - - syncfusiongrid: - tab: 'Gene Expression' - Title: 'RPKM, grouped by gene name' - - rpkm_common_tss: - 
type: File - format: "http://edamontology.org/format_3475" - label: "RPKM, grouped by common TSS" - doc: "Calculated rpkm values, grouped by common TSS" - outputSource: group_isoforms/common_tss_file - - htseq_count_gene_expression_file: - type: File - format: "http://edamontology.org/format_3475" - label: "HTSeq: read counts grouped by gene_id" - doc: "HTSeq: read counts grouped by gene_id" - outputSource: htseq_count_gene_expression/feature_counts_report_file - - htseq_count_stdout_log: - type: File - format: "http://edamontology.org/format_2330" - label: "HTSeq: stdout log" - doc: "HTSeq: stdout log" - outputSource: htseq_count_gene_expression/stdout_log - - htseq_count_stderr_log: - type: File - format: "http://edamontology.org/format_2330" - label: "HTSeq: stderr log" - doc: "HTSeq: stderr log" - outputSource: htseq_count_gene_expression/stderr_log - - get_stat_log: - type: File? - label: "YAML formatted combined log" - format: "http://edamontology.org/format_3750" - doc: "YAML formatted combined log" - outputSource: get_stat/collected_statistics_yaml - - get_stat_markdown: - type: File? - label: "Markdown formatted combined log" - format: "http://edamontology.org/format_3835" - doc: "Markdown formatted combined log" - outputSource: get_stat/collected_statistics_md - 'sd:visualPlugins': - - markdownView: - tab: 'Overview' - - get_formatted_stats: - type: File? 
- label: "Bowtie, STAR and GEEP mapping stats" - format: "http://edamontology.org/format_2330" - doc: "Processed and combined Bowtie & STAR aligner and GEEP logs" - outputSource: get_stat/collected_statistics_tsv - 'sd:visualPlugins': - - tableView: - vertical: true - tab: 'Overview' - 'sd:preview': - 'sd:visualPlugins': - - pie: - colors: ['#b3de69', '#99c0db', '#fdc381', '#fb8072'] - data: [$2, $3, $4, $5] - - bam_statistics_report: - type: File - label: "BAM statistics report" - format: "http://edamontology.org/format_2330" - doc: "BAM statistics report (right after alignment and sorting)" - outputSource: get_bam_statistics/log_file - -steps: - - extract_fastq: - run: ../tools/extract-fastq.cwl - in: - compressed_file: fastq_file - out: [fastq_file] - - star_aligner: - run: ../tools/star-alignreads.cwl - in: - readFilesIn: extract_fastq/fastq_file - genomeDir: star_indices_folder - outFilterMultimapNmax: - default: 1 - outFilterMismatchNmax: - default: 5 - alignSJDBoverhangMin: - default: 1 - seedSearchStartLmax: - default: 15 - clip3pNbases: clip_3p_end - clip5pNbases: clip_5p_end - threads: threads - out: - - aligned_file - - log_final - - uniquely_mapped_reads_number - - log_out - - log_progress - - log_std - - log_sj - - fastx_quality_stats: - run: ../tools/fastx-quality-stats.cwl - in: - input_file: extract_fastq/fastq_file - out: [statistics_file] - - samtools_sort_index: - run: ../tools/samtools-sort-index.cwl - in: - sort_input: star_aligner/aligned_file - sort_output_filename: - source: extract_fastq/fastq_file - valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'.bam') - threads: threads - out: [bam_bai_pair] - - bam_to_bigwig_upstream: - run: ../tools/bam-bedgraph-bigwig.cwl - in: - bam_file: samtools_sort_index/bam_bai_pair - chrom_length_file: chrom_length_file - mapped_reads_number: star_aligner/uniquely_mapped_reads_number - bigwig_filename: - source: samtools_sort_index/bam_bai_pair - valueFrom: | - ${ - let root 
= self.basename.split('.').slice(0,-1).join('.'); - let ext = "_upstream.bigWig"; - return (root == "")?self.basename+ext:root+ext; - } - strand: - default: '+' - out: [bigwig_file] - - bam_to_bigwig_downstream: - run: ../tools/bam-bedgraph-bigwig.cwl - in: - bam_file: samtools_sort_index/bam_bai_pair - chrom_length_file: chrom_length_file - mapped_reads_number: - source: star_aligner/uniquely_mapped_reads_number - valueFrom: $(-self) - bigwig_filename: - source: samtools_sort_index/bam_bai_pair - valueFrom: | - ${ - let root = self.basename.split('.').slice(0,-1).join('.'); - let ext = "_downstream.bigWig"; - return (root == "")?self.basename+ext:root+ext; - } - strand: - default: '-' - out: [bigwig_file] - - bowtie_aligner: - run: ../tools/bowtie-alignreads.cwl - in: - upstream_filelist: extract_fastq/fastq_file - indices_folder: bowtie_indices_folder - clip_3p_end: clip_3p_end - clip_5p_end: clip_5p_end - v: - default: 3 - m: - default: 1 - best: - default: true - strata: - default: true - sam: - default: true - threads: threads - out: [log_file] - - rpkm_calculation: - run: ../tools/geep.cwl - in: - bam_file: samtools_sort_index/bam_bai_pair - annotation_file: annotation_file - dutp: - default: true - rpkm_threshold: - default: 0.001 - exclude_chr: exclude_chr - threads: threads - out: [isoforms_file] - - group_isoforms: - run: ../tools/group-isoforms.cwl - in: - isoforms_file: rpkm_calculation/isoforms_file - out: - - genes_file - - common_tss_file - - get_annotation_gtf: - run: ../tools/ucsc-genepredtogtf.cwl - in: - annotation_tsv_file: annotation_file - out: - - annotation_gtf_file - - htseq_count_gene_expression: - run: ../tools/htseq-count.cwl - in: - alignment_bam_file: samtools_sort_index/bam_bai_pair - annotation_gtf_file: get_annotation_gtf/annotation_gtf_file - strand_specific: - default: "reverse" - feature_type: - default: "exon" - feature_id: - default: "gene_id" - out: - - feature_counts_report_file - - stdout_log - - stderr_log - - 
get_bam_statistics: - run: ../tools/samtools-stats.cwl - in: - bambai_pair: samtools_sort_index/bam_bai_pair - output_filename: - source: samtools_sort_index/bam_bai_pair - valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") - out: [log_file] - - get_stat: - run: ../tools/collect-statistics-rna-seq.cwl - in: - star_alignment_report: star_aligner/log_final - bowtie_alignment_report: bowtie_aligner/log_file - bam_statistics_report: get_bam_statistics/log_file - isoforms_file: rpkm_calculation/isoforms_file - out: [collected_statistics_yaml, collected_statistics_tsv, collected_statistics_md] - - -$namespaces: - s: http://schema.org/ - -$schemas: -- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - -s:name: "Deprecated. RNA-Seq pipeline single-read strand specific" -label: "Deprecated. RNA-Seq pipeline single-read strand specific" -s:alternateName: "RNA-Seq basic analysis workflow for strand specific single-read experiment" - -s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/rnaseq-se-dutp.cwl -s:codeRepository: https://github.com/datirium/workflows -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Cincinnati Children's Hospital Medical Center" - s:location: - - class: s:PostalAddress - s:addressCountry: "USA" - s:addressLocality: "Cincinnati" - s:addressRegion: "OH" - s:postalCode: "45229" - s:streetAddress: "3333 Burnet Ave" - s:telephone: "+1(513)636-4200" - s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" - s:department: - - class: s:Organization - s:legalName: "Allergy and Immunology" - s:department: - - class: s:Organization - s:legalName: "Barski Research Lab" - s:member: - - class: s:Person - s:name: Michael Kotliar - s:email: 
mailto:misha.kotliar@gmail.com - s:sameAs: - - id: http://orcid.org/0000-0002-6486-3898 - - class: s:Person - s:name: Andrey Kartashov - s:email: mailto:Andrey.Kartashov@cchmc.org - s:sameAs: - - id: http://orcid.org/0000-0001-9102-5681 - - -# doc: -# $include: ../descriptions/rnaseq-se-dutp.md - - -doc: | - Note: should be updated - The original [BioWardrobe's](https://biowardrobe.com) [PubMed ID:26248465](https://www.ncbi.nlm.nih.gov/pubmed/26248465) - **RNA-Seq** basic analysis for **strand specific single-read** experiment. - A corresponded input [FASTQ](http://maq.sourceforge.net/fastq.shtml) file has to be provided. - - Current workflow should be used only with the single-read RNA-Seq data. It performs the following steps: - 1. Use STAR to align reads from input FASTQ file according to the predefined reference indices; generate unsorted BAM file and alignment statistics file - 2. Use fastx_quality_stats to analyze input FASTQ file and generate quality statistics file - 3. Use samtools sort to generate coordinate sorted BAM(+BAI) file pair from the unsorted BAM file obtained on the step 1 (after running STAR) - 5. Generate BigWig file on the base of sorted BAM file - 6. Map input FASTQ file to predefined rRNA reference indices using Bowtie to define the level of rRNA contamination; export resulted statistics to file - 7. 
Calculate isoform expression level for the sorted BAM file and GTF/TAB annotation file using GEEP reads-counting utility; export results to file \ No newline at end of file diff --git a/workflows/rnaseq-se.cwl b/workflows/rnaseq-se.cwl deleted file mode 100644 index 5d50ecf9..00000000 --- a/workflows/rnaseq-se.cwl +++ /dev/null @@ -1,480 +0,0 @@ -cwlVersion: v1.0 -class: Workflow - -requirements: - - class: SubworkflowFeatureRequirement - - class: StepInputExpressionRequirement - - class: InlineJavascriptRequirement - expressionLib: - - var get_root = function(basename) { - return basename.split('.').slice(0,1).join('.'); - }; - - -'sd:metadata': - - "../metadata/rnaseq-header.cwl" - -'sd:upstream': - genome_indices: "genome-indices.cwl" - -inputs: - -# General inputs - - star_indices_folder: - type: Directory - label: "STAR indices folder" - 'sd:upstreamSource': "genome_indices/star_indices" - doc: "Path to STAR generated indices" - - bowtie_indices_folder: - type: Directory - label: "BowTie Ribosomal Indices" - 'sd:upstreamSource': "genome_indices/ribosomal_indices" - doc: "Path to Bowtie generated indices" - - chrom_length_file: - type: File - label: "Chromosome length file" - format: "http://edamontology.org/format_2330" - 'sd:upstreamSource': "genome_indices/chrom_length" - doc: "Chromosome length file" - - annotation_file: - type: File - label: "Annotation file" - format: - - "http://edamontology.org/format_2306" - - "http://edamontology.org/format_3475" - 'sd:upstreamSource': "genome_indices/annotation" - doc: "GTF or TAB-separated annotation file" - - fastq_file: - type: File - label: "FASTQ input file" - format: "http://edamontology.org/format_1930" - doc: "Reads data in a FASTQ format" - -# Advanced inputs - - exclude_chr: - type: string? - 'sd:layout': - advanced: true - label: "Chromosome to be excluded in rpkm calculation" - doc: "Chromosome to be excluded in rpkm calculation" - - clip_3p_end: - type: int? 
- default: 0 - 'sd:layout': - advanced: true - label: "Clip from 3p end" - doc: "Number of bases to clip from the 3p end" - - clip_5p_end: - type: int? - default: 0 - 'sd:layout': - advanced: true - label: "Clip from 5p end" - doc: "Number of bases to clip from the 5p end" - -# System dependent - - threads: - type: int? - default: 2 - 'sd:layout': - advanced: true - label: "Number of threads" - doc: "Number of threads for those steps that support multithreading" - -outputs: - - bigwig: - type: File - format: "http://edamontology.org/format_3006" - label: "BigWig file" - doc: "Generated BigWig file" - outputSource: bam_to_bigwig/bigwig_file - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'wig' - name: "BigWig Track" - height: 120 - - star_final_log: - type: File - format: "http://edamontology.org/format_2330" - label: "STAR final log" - doc: "STAR Log.final.out" - outputSource: star_aligner/log_final - - star_out_log: - type: File? - format: "http://edamontology.org/format_2330" - label: "STAR log out" - doc: "STAR Log.out" - outputSource: star_aligner/log_out - - star_progress_log: - type: File? - format: "http://edamontology.org/format_2330" - label: "STAR progress log" - doc: "STAR Log.progress.out" - outputSource: star_aligner/log_progress - - star_stdout_log: - type: File? - format: "http://edamontology.org/format_2330" - label: "STAR stdout log" - doc: "STAR Log.std.out" - outputSource: star_aligner/log_std - - star_sj_log: - type: File? 
- format: "http://edamontology.org/format_2330" - label: "STAR sj log" - doc: "STAR SJ.out.tab" - outputSource: star_aligner/log_sj - - fastx_statistics: - type: File - format: "http://edamontology.org/format_2330" - label: "FASTQ statistics" - doc: "fastx_quality_stats generated FASTQ file quality statistics file" - outputSource: fastx_quality_stats/statistics_file - 'sd:visualPlugins': - - line: - tab: 'QC Plots' - Title: 'Base frequency plot' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Frequency' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$13, $14, $15, $16, $17] - - boxplot: - tab: 'QC Plots' - Title: 'Quality Control' - xAxisTitle: 'Nucleotide position' - yAxisTitle: 'Quality score' - colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] - data: [$11, $7, $8, $9, $12] - - bambai_pair: - type: File - format: "http://edamontology.org/format_2572" - label: "Coordinate sorted BAM alignment file (+index BAI)" - doc: "Coordinate sorted BAM file and BAI index file" - outputSource: samtools_sort_index/bam_bai_pair - 'sd:visualPlugins': - - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - optional: true - type: 'alignment' - format: 'bam' - name: "BAM Track" - displayMode: "SQUISHED" - - bowtie_log: - type: File - format: "http://edamontology.org/format_2330" - label: "Bowtie alignment log" - doc: "Bowtie alignment log file" - outputSource: bowtie_aligner/log_file - - rpkm_isoforms: - type: File - format: "http://edamontology.org/format_3752" - label: "RPKM, grouped by isoforms" - doc: "Calculated rpkm values, grouped by isoforms" - outputSource: rpkm_calculation/isoforms_file - - rpkm_genes: - type: File - format: "http://edamontology.org/format_3475" - label: "RPKM, grouped by gene name" - doc: "Calculated rpkm values, grouped by gene name" - outputSource: group_isoforms/genes_file - 'sd:visualPlugins': - - syncfusiongrid: - tab: 'Gene Expression' - Title: 'RPKM, grouped by gene name' - - rpkm_common_tss: - 
type: File - format: "http://edamontology.org/format_3475" - label: "RPKM, grouped by common TSS" - doc: "Calculated rpkm values, grouped by common TSS" - outputSource: group_isoforms/common_tss_file - - htseq_count_gene_expression_file: - type: File - format: "http://edamontology.org/format_3475" - label: "HTSeq: read counts grouped by gene_id" - doc: "HTSeq: read counts grouped by gene_id" - outputSource: htseq_count_gene_expression/feature_counts_report_file - - htseq_count_stdout_log: - type: File - format: "http://edamontology.org/format_2330" - label: "HTSeq: stdout log" - doc: "HTSeq: stdout log" - outputSource: htseq_count_gene_expression/stdout_log - - htseq_count_stderr_log: - type: File - format: "http://edamontology.org/format_2330" - label: "HTSeq: stderr log" - doc: "HTSeq: stderr log" - outputSource: htseq_count_gene_expression/stderr_log - - get_stat_log: - type: File? - label: "YAML formatted combined log" - format: "http://edamontology.org/format_3750" - doc: "YAML formatted combined log" - outputSource: get_stat/collected_statistics_yaml - - get_stat_markdown: - type: File? - label: "Markdown formatted combined log" - format: "http://edamontology.org/format_3835" - doc: "Markdown formatted combined log" - outputSource: get_stat/collected_statistics_md - 'sd:visualPlugins': - - markdownView: - tab: 'Overview' - - get_formatted_stats: - type: File? 
- label: "Bowtie, STAR and GEEP mapping stats" - format: "http://edamontology.org/format_2330" - doc: "Processed and combined Bowtie & STAR aligner and GEEP logs" - outputSource: get_stat/collected_statistics_tsv - 'sd:visualPlugins': - - tableView: - vertical: true - tab: 'Overview' - 'sd:preview': - 'sd:visualPlugins': - - pie: - colors: ['#b3de69', '#99c0db', '#fdc381', '#fb8072'] - data: [$2, $3, $4, $5] - - bam_statistics_report: - type: File - label: "BAM statistics report" - format: "http://edamontology.org/format_2330" - doc: "BAM statistics report (right after alignment and sorting)" - outputSource: get_bam_statistics/log_file - - -steps: - - extract_fastq: - run: ../tools/extract-fastq.cwl - in: - compressed_file: fastq_file - out: [fastq_file] - - star_aligner: - run: ../tools/star-alignreads.cwl - in: - readFilesIn: extract_fastq/fastq_file - genomeDir: star_indices_folder - outFilterMultimapNmax: - default: 1 - outFilterMismatchNmax: - default: 5 - alignSJDBoverhangMin: - default: 1 - seedSearchStartLmax: - default: 15 - clip3pNbases: clip_3p_end - clip5pNbases: clip_5p_end - threads: threads - out: - - aligned_file - - log_final - - uniquely_mapped_reads_number - - log_out - - log_progress - - log_std - - log_sj - - fastx_quality_stats: - run: ../tools/fastx-quality-stats.cwl - in: - input_file: extract_fastq/fastq_file - out: [statistics_file] - - samtools_sort_index: - run: ../tools/samtools-sort-index.cwl - in: - sort_input: star_aligner/aligned_file - sort_output_filename: - source: extract_fastq/fastq_file - valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'.bam') - threads: threads - out: [bam_bai_pair] - - bam_to_bigwig: - run: ../tools/bam-bedgraph-bigwig.cwl - in: - bam_file: samtools_sort_index/bam_bai_pair - chrom_length_file: chrom_length_file - mapped_reads_number: star_aligner/uniquely_mapped_reads_number -# fragmentsize is not set (STAR gives only read length). 
It will be calculated automatically by bedtools genomecov. - out: [bigwig_file] - - bowtie_aligner: - run: ../tools/bowtie-alignreads.cwl - in: - upstream_filelist: extract_fastq/fastq_file - indices_folder: bowtie_indices_folder - clip_3p_end: clip_3p_end - clip_5p_end: clip_5p_end - v: - default: 3 - m: - default: 1 - best: - default: true - strata: - default: true - sam: - default: true - threads: threads - out: [log_file] - - rpkm_calculation: - run: ../tools/geep.cwl - in: - bam_file: samtools_sort_index/bam_bai_pair - annotation_file: annotation_file - rpkm_threshold: - default: 0.001 - exclude_chr: exclude_chr - threads: threads - out: [isoforms_file] - - group_isoforms: - run: ../tools/group-isoforms.cwl - in: - isoforms_file: rpkm_calculation/isoforms_file - out: - - genes_file - - common_tss_file - - get_annotation_gtf: - run: ../tools/ucsc-genepredtogtf.cwl - in: - annotation_tsv_file: annotation_file - out: - - annotation_gtf_file - - htseq_count_gene_expression: - run: ../tools/htseq-count.cwl - in: - alignment_bam_file: samtools_sort_index/bam_bai_pair - annotation_gtf_file: get_annotation_gtf/annotation_gtf_file - strand_specific: - default: "no" - feature_type: - default: "exon" - feature_id: - default: "gene_id" - out: - - feature_counts_report_file - - stdout_log - - stderr_log - - get_bam_statistics: - run: ../tools/samtools-stats.cwl - in: - bambai_pair: samtools_sort_index/bam_bai_pair - output_filename: - source: samtools_sort_index/bam_bai_pair - valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") - out: [log_file] - - get_stat: - run: ../tools/collect-statistics-rna-seq.cwl - in: - star_alignment_report: star_aligner/log_final - bowtie_alignment_report: bowtie_aligner/log_file - bam_statistics_report: get_bam_statistics/log_file - isoforms_file: rpkm_calculation/isoforms_file - out: [collected_statistics_yaml, collected_statistics_tsv, collected_statistics_md] - - -$namespaces: - s: http://schema.org/ - -$schemas: -- 
https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - -s:name: "Deprecated. RNA-Seq pipeline single-read" -label: "Deprecated. RNA-Seq pipeline single-read" -s:alternateName: "RNA-Seq basic analysis workflow for single-read experiment" - -s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/rnaseq-se.cwl -s:codeRepository: https://github.com/datirium/workflows -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Cincinnati Children's Hospital Medical Center" - s:location: - - class: s:PostalAddress - s:addressCountry: "USA" - s:addressLocality: "Cincinnati" - s:addressRegion: "OH" - s:postalCode: "45229" - s:streetAddress: "3333 Burnet Ave" - s:telephone: "+1(513)636-4200" - s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" - s:department: - - class: s:Organization - s:legalName: "Allergy and Immunology" - s:department: - - class: s:Organization - s:legalName: "Barski Research Lab" - s:member: - - class: s:Person - s:name: Michael Kotliar - s:email: mailto:misha.kotliar@gmail.com - s:sameAs: - - id: http://orcid.org/0000-0002-6486-3898 - - class: s:Person - s:name: Andrey Kartashov - s:email: mailto:Andrey.Kartashov@cchmc.org - s:sameAs: - - id: http://orcid.org/0000-0001-9102-5681 - - -# doc: -# $include: ../descriptions/rnaseq-se.md - - -doc: | - The original [BioWardrobe's](https://biowardrobe.com) [PubMed ID:26248465](https://www.ncbi.nlm.nih.gov/pubmed/26248465) - **RNA-Seq** basic analysis for a **single-read** experiment. - A corresponded input [FASTQ](http://maq.sourceforge.net/fastq.shtml) file has to be provided. - - Current workflow should be used only with the single-read RNA-Seq data. It performs the following steps: - 1. 
Use STAR to align reads from input FASTQ file according to the predefined reference indices; generate unsorted BAM file and alignment statistics file - 2. Use fastx_quality_stats to analyze input FASTQ file and generate quality statistics file - 3. Use samtools sort to generate coordinate sorted BAM(+BAI) file pair from the unsorted BAM file obtained on the step 1 (after running STAR) - 5. Generate BigWig file on the base of sorted BAM file - 6. Map input FASTQ file to predefined rRNA reference indices using Bowtie to define the level of rRNA contamination; export resulted statistics to file - 7. Calculate isoform expression level for the sorted BAM file and GTF/TAB annotation file using GEEP reads-counting utility; export results to file \ No newline at end of file diff --git a/workflows/sc-assign-cell-types.cwl b/workflows/sc-assign-cell-types.cwl deleted file mode 100644 index 98278dd4..00000000 --- a/workflows/sc-assign-cell-types.cwl +++ /dev/null @@ -1,337 +0,0 @@ -cwlVersion: v1.0 -class: Workflow - - -requirements: - - class: SubworkflowFeatureRequirement - - class: StepInputExpressionRequirement - - class: MultipleInputFeatureRequirement - - class: InlineJavascriptRequirement - expressionLib: - - var split_features = function(line) { - function get_unique(value, index, self) { - return self.indexOf(value) === index && value != ""; - } - var splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; - return (splitted_line && !!splitted_line.length)?splitted_line:null; - }; - - var get_source_column = function(resolution, from_aggregated) { - if (from_aggregated) { - return "integrated_snn_res."+resolution; - } else { - return "RNA_snn_res."+resolution; - } - }; - - var get_target_column = function(resolution) { - return "cluster_ext_type_res."+resolution; - }; - - -'sd:upstream': - seurat_cluster_sample: - - "seurat-cluster.cwl" - - -inputs: - - alias: - type: string - label: "Experiment short name/Alias" - sd:preview: - position: 1 - - 
seurat_data_rds: - type: File - label: "Seurat Cluster Experiment" - doc: | - Path to the RDS file to load Seurat object from. - RDS file can be produced by run_seurat.R script. - 'sd:upstreamSource': "seurat_cluster_sample/seurat_clst_data_rds" - 'sd:localLabel': true - - from_aggregated: - type: boolean? - default: true - label: "Treat Seurat Cluster Experiment as aggregated" - doc: | - If set to true the 'source_column' and 'target_column' inputs will have - prefix 'integrated_snn_res.{resolution}', otherwise 'RNA_res.{resolution}' - - resolution: - type: string - label: "Clustering resolution to assign cell types to" - doc: | - Clustering resolution define 'source_column' and 'target_column' - inputs for 'assign_cell_types' step - - cell_type_data: - type: File - label: "TSV/CSV cell types metadata file with 'cluster' and 'type' columns" - doc: | - Path to the cell types metadata TSV/CSV file with - "cluster" and "type" columns - - selected_features: - type: string? - default: null - label: "Comma or space separated list of genes of interest" - doc: | - Features of interest to evaluate expression. - 'sd:layout': - advanced: true - - threads: - type: int? - default: 2 - label: "Threads number to use" - doc: | - Threads number - 'sd:layout': - advanced: true - - -outputs: - - umap_ctype_plot_png: - type: File? - outputSource: assign_cell_types/umap_ctype_plot_png - label: "Grouped by cell type UMAP projected PCA of filtered integrated/scaled datasets" - doc: | - Grouped by cell type UMAP projected PCA of filtered integrated/scaled datasets. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Cell Types' - Caption: 'Grouped by cell type UMAP projected PCA of filtered integrated/scaled datasets' - - umap_ctype_plot_pdf: - type: File? 
- outputSource: assign_cell_types/umap_ctype_plot_pdf - label: "Grouped by cell type UMAP projected PCA of filtered integrated/scaled datasets" - doc: | - Grouped by cell type UMAP projected PCA of filtered integrated/scaled datasets. - PDF format - - umap_ctype_spl_by_cond_plot_png: - type: File? - outputSource: assign_cell_types/umap_ctype_spl_by_cond_plot_png - label: "Split by condition grouped by cell type UMAP projected PCA of filtered integrated/scaled datasets" - doc: | - Split by condition grouped by cell type UMAP projected PCA of filtered integrated/scaled datasets - PNG format - 'sd:visualPlugins': - - image: - tab: 'Cell Types' - Caption: 'Split by condition grouped by cell type UMAP projected PCA of filtered integrated/scaled datasets' - - umap_ctype_spl_by_cond_plot_pdf: - type: File? - outputSource: assign_cell_types/umap_ctype_spl_by_cond_plot_pdf - label: "Split by condition grouped by cell type UMAP projected PCA of filtered integrated/scaled datasets" - doc: | - Split by condition grouped by cell type UMAP projected PCA of filtered integrated/scaled datasets - PDF format - - expr_avg_per_ctype_plot_png: - type: File? - outputSource: assign_cell_types/expr_avg_per_ctype_plot_png - label: "Scaled average log normalized gene expression per predicted cell type of filtered integrated/scaled datasets" - doc: | - Scaled average log normalized gene expression per predicted cell type of filtered integrated/scaled datasets - PNG format - 'sd:visualPlugins': - - image: - tab: 'Cell Types' - Caption: 'Scaled average log normalized gene expression per predicted cell type of filtered integrated/scaled datasets' - - expr_avg_per_ctype_plot_pdf: - type: File? 
- outputSource: assign_cell_types/expr_avg_per_ctype_plot_pdf - label: "Scaled average log normalized gene expression per predicted cell type of filtered integrated/scaled datasets" - doc: | - Scaled average log normalized gene expression per predicted cell type of filtered integrated/scaled datasets - PDF format - - expr_per_ctype_cell_plot_png: - type: File? - outputSource: assign_cell_types/expr_per_ctype_cell_plot_png - label: "Log normalized gene expression per cell of clustered filtered integrated/scaled datasets with predicted cell types" - doc: | - Log normalized gene expression per cell of clustered filtered integrated/scaled datasets with predicted cell types - PNG format - 'sd:visualPlugins': - - image: - tab: 'Cell Types' - Caption: 'Log normalized gene expression per cell of clustered filtered integrated/scaled datasets with predicted cell types' - - expr_per_ctype_cell_plot_pdf: - type: File? - outputSource: assign_cell_types/expr_per_ctype_cell_plot_pdf - label: "Log normalized gene expression per cell of clustered filtered integrated/scaled datasets with predicted cell types" - doc: | - Log normalized gene expression per cell of clustered filtered integrated/scaled datasets with predicted cell types - PDF format - - expr_dnst_per_ctype_plot_png: - type: File? - outputSource: assign_cell_types/expr_dnst_per_ctype_plot_png - label: "Log normalized gene expression densities per predicted cell type of filtered integrated/scaled datasets" - doc: | - Log normalized gene expression densities per predicted cell type of filtered integrated/scaled datasets - PNG format - 'sd:visualPlugins': - - image: - tab: 'Cell Types' - Caption: 'Log normalized gene expression densities per predicted cell type of filtered integrated/scaled datasets' - - expr_dnst_per_ctype_plot_pdf: - type: File? 
- outputSource: assign_cell_types/expr_dnst_per_ctype_plot_pdf - label: "Log normalized gene expression densities per predicted cell type of filtered integrated/scaled datasets" - doc: | - Log normalized gene expression densities per predicted cell type of filtered integrated/scaled datasets - PDF format - - seurat_ctype_data_rds: - type: File - outputSource: assign_cell_types/seurat_ctype_data_rds - label: "Clustered filtered integrated/scaled Seurat data with assigned cell types" - doc: | - Clustered filtered integrated/scaled Seurat data with assigned cell types. - RDS format - - compressed_cellbrowser_config_data: - type: File - outputSource: compress_cellbrowser_config_data/compressed_folder - label: "Compressed directory with UCSC Cellbrowser configuration data" - doc: | - Compressed directory with UCSC Cellbrowser configuration data - - cellbrowser_html_data: - type: Directory - outputSource: assign_cell_types/cellbrowser_html_data - label: "Directory with UCSC Cellbrowser formatted html data" - doc: | - Directory with UCSC Cellbrowser formatted html data - - cellbrowser_html_file: - type: File - outputSource: assign_cell_types/cellbrowser_html_file - label: "Open in UCSC Cell Browser" - doc: | - HTML index file from the directory with UCSC Cellbrowser formatted html data - 'sd:visualPlugins': - - linkList: - tab: 'Overview' - target: "_blank" - - assign_cell_types_stdout_log: - type: File - outputSource: assign_cell_types/stdout_log - label: stdout log generated by 'assign_cell_types' step - doc: | - stdout log generated by 'assign_cell_types' step - - assign_cell_types_stderr_log: - type: File - outputSource: assign_cell_types/stderr_log - label: stderr log generated by 'assign_cell_types' step - doc: | - stderr log generated by 'assign_cell_types' step - - -steps: - - assign_cell_types: - run: ../tools/sc-assign-cell-types.cwl - in: - seurat_data_rds: seurat_data_rds - cell_type_data: cell_type_data - source_column: - source: [resolution, from_aggregated] 
- valueFrom: $(get_source_column(self[0], self[1])) - target_column: - source: resolution - valueFrom: $(get_target_column(self)) - selected_features: - source: selected_features - valueFrom: $(split_features(self)) - export_pdf_plots: - default: true - threads: threads - out: - - umap_ctype_plot_png - - umap_ctype_plot_pdf - - umap_ctype_spl_by_cond_plot_png - - umap_ctype_spl_by_cond_plot_pdf - - expr_avg_per_ctype_plot_png - - expr_avg_per_ctype_plot_pdf - - expr_per_ctype_cell_plot_png - - expr_per_ctype_cell_plot_pdf - - expr_dnst_per_ctype_plot_png - - expr_dnst_per_ctype_plot_pdf - - seurat_ctype_data_rds - - cellbrowser_config_data - - cellbrowser_html_data - - cellbrowser_html_file - - stdout_log - - stderr_log - - compress_cellbrowser_config_data: - run: ../tools/tar-compress.cwl - in: - folder_to_compress: assign_cell_types/cellbrowser_config_data - out: - - compressed_folder - - -$namespaces: - s: http://schema.org/ - -$schemas: -- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - -label: "Deprecated. Single-cell Assign Cell Types" -s:name: "Deprecated. 
Single-cell Assign Cell Types" -s:alternateName: "Assigns cell types to Seurat clusters" - -s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/sc-assign-cell-types.cwl -s:codeRepository: https://github.com/datirium/workflows -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Cincinnati Children's Hospital Medical Center" - s:location: - - class: s:PostalAddress - s:addressCountry: "USA" - s:addressLocality: "Cincinnati" - s:addressRegion: "OH" - s:postalCode: "45229" - s:streetAddress: "3333 Burnet Ave" - s:telephone: "+1(513)636-4200" - s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" - s:department: - - class: s:Organization - s:legalName: "Allergy and Immunology" - s:department: - - class: s:Organization - s:legalName: "Barski Research Lab" - s:member: - - class: s:Person - s:name: Michael Kotliar - s:email: mailto:misha.kotliar@gmail.com - s:sameAs: - - id: http://orcid.org/0000-0002-6486-3898 - - -doc: | - Deprecated. Single-cell Assign Cell Types - ========================================= - - Assigns cell types to Seurat clusters. 
\ No newline at end of file diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index c6dbc3ea..aafe59ef 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -149,6 +149,10 @@ inputs: symbols: - "1" - "2" + - "3" + - "4" + - "5" + - "6" default: "1" label: "Cores/CPUs" doc: | diff --git a/workflows/sc-atac-coverage.cwl b/workflows/sc-atac-coverage.cwl index 940fff8d..38214a84 100644 --- a/workflows/sc-atac-coverage.cwl +++ b/workflows/sc-atac-coverage.cwl @@ -112,47 +112,25 @@ inputs: 'sd:layout': advanced: true - parallel_memory_limit: - type: - - "null" - - type: enum - symbols: - - "32" - default: "32" - label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" - doc: | - Maximum memory in GB allowed to be shared between the workers - when using multiple --cpus. - Forced to 32 GB - 'sd:layout': - advanced: true - - vector_memory_limit: - type: - - "null" - - type: enum - symbols: - - "64" - default: "64" - label: "Maximum vector memory in GB allowed to be used by R" - doc: | - Maximum vector memory in GB allowed to be used by R. - Forced to 64 GB - 'sd:layout': - advanced: true - threads: type: - "null" - type: enum symbols: - "1" + - "2" + - "3" + - "4" + - "5" + - "6" default: "1" - label: "Number of cores/cpus to use" + label: "Cores/CPUs" doc: | - Number of cores/cpus to use - Forced to 1 - 'sd:layout': + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. 
+ Default: 1 + "sd:layout": advanced: true @@ -251,11 +229,9 @@ steps: verbose: default: true parallel_memory_limit: - source: parallel_memory_limit - valueFrom: $(parseInt(self)) + default: 32 vector_memory_limit: - source: vector_memory_limit - valueFrom: $(parseInt(self)) + default: 96 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-atac-dbinding.cwl b/workflows/sc-atac-dbinding.cwl index 298e3e24..cd66d062 100644 --- a/workflows/sc-atac-dbinding.cwl +++ b/workflows/sc-atac-dbinding.cwl @@ -248,47 +248,25 @@ inputs: 'sd:layout': advanced: true - parallel_memory_limit: - type: - - "null" - - type: enum - symbols: - - "32" - default: "32" - label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" - doc: | - Maximum memory in GB allowed to be shared between the workers - when using multiple --cpus. - Forced to 32 GB - 'sd:layout': - advanced: true - - vector_memory_limit: - type: - - "null" - - type: enum - symbols: - - "64" - default: "64" - label: "Maximum vector memory in GB allowed to be used by R" - doc: | - Maximum vector memory in GB allowed to be used by R. - Forced to 64 GB - 'sd:layout': - advanced: true - threads: type: - "null" - type: enum symbols: - "1" + - "2" + - "3" + - "4" + - "5" + - "6" default: "1" - label: "Number of cores/cpus to use" + label: "Cores/CPUs" doc: | - Number of cores/cpus to use - Forced to 1 - 'sd:layout': + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. 
+ Default: 1 + "sd:layout": advanced: true @@ -685,11 +663,9 @@ steps: export_pdf_plots: default: true parallel_memory_limit: - source: parallel_memory_limit - valueFrom: $(parseInt(self)) + default: 32 vector_memory_limit: - source: vector_memory_limit - valueFrom: $(parseInt(self)) + default: 96 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index a80672c2..16393864 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -214,6 +214,8 @@ inputs: - "2" - "3" - "4" + - "5" + - "6" default: "1" label: "Cores/CPUs" doc: | diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 0f268d34..8e07b5b4 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -182,6 +182,10 @@ inputs: symbols: - "1" - "2" + - "3" + - "4" + - "5" + - "6" default: "1" label: "Cores/CPUs" doc: | diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 5d75eaba..7aceb235 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -440,6 +440,8 @@ inputs: - "2" - "3" - "4" + - "5" + - "6" default: "1" label: "Cores/CPUs" doc: | diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index df02fc40..1eaa02be 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -134,6 +134,10 @@ inputs: symbols: - "1" - "2" + - "3" + - "4" + - "5" + - "6" default: "1" label: "Cores/CPUs" doc: | diff --git a/workflows/sc-rna-da-cells.cwl b/workflows/sc-rna-da-cells.cwl index 08cfa767..bb5662de 100644 --- a/workflows/sc-rna-da-cells.cwl +++ b/workflows/sc-rna-da-cells.cwl @@ -126,47 +126,25 @@ inputs: 'sd:layout': advanced: true - parallel_memory_limit: - type: - - "null" - - type: enum - symbols: - - "32" - default: "32" - label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" - doc: | - Maximum memory in GB allowed to be 
shared between the workers - when using multiple --cpus. - Forced to 32 GB - 'sd:layout': - advanced: true - - vector_memory_limit: - type: - - "null" - - type: enum - symbols: - - "64" - default: "64" - label: "Maximum vector memory in GB allowed to be used by R" - doc: | - Maximum vector memory in GB allowed to be used by R. - Forced to 64 GB - 'sd:layout': - advanced: true - threads: type: - "null" - type: enum symbols: - "1" + - "2" + - "3" + - "4" + - "5" + - "6" default: "1" - label: "Number of cores/cpus to use" + label: "Cores/CPUs" doc: | - Number of cores/cpus to use - Forced to 1 - 'sd:layout': + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 1 + "sd:layout": advanced: true @@ -390,11 +368,9 @@ steps: default: true color_theme: color_theme parallel_memory_limit: - source: parallel_memory_limit - valueFrom: $(parseInt(self)) + default: 32 vector_memory_limit: - source: vector_memory_limit - valueFrom: $(parseInt(self)) + default: 96 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-rna-de-pseudobulk.cwl b/workflows/sc-rna-de-pseudobulk.cwl index 22f75502..a51f1252 100644 --- a/workflows/sc-rna-de-pseudobulk.cwl +++ b/workflows/sc-rna-de-pseudobulk.cwl @@ -276,6 +276,10 @@ inputs: symbols: - "1" - "2" + - "3" + - "4" + - "5" + - "6" default: "1" label: "Cores/CPUs" doc: | diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index 96c77dbe..d8d79144 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -197,47 +197,25 @@ inputs: 'sd:layout': advanced: true - parallel_memory_limit: - type: - - "null" - - type: enum - symbols: - - "32" - default: "32" - label: "Maximum memory in GB allowed to be shared between the workers when using multiple CPUs" - doc: | - Maximum memory in GB allowed to be shared between the workers - when using multiple --cpus. 
- Forced to 32 GB - 'sd:layout': - advanced: true - - vector_memory_limit: - type: - - "null" - - type: enum - symbols: - - "32" - default: "32" - label: "Maximum vector memory in GB allowed to be used by R" - doc: | - Maximum vector memory in GB allowed to be used by R. - Forced to 32 GB - 'sd:layout': - advanced: true - threads: type: - "null" - type: enum symbols: - "1" + - "2" + - "3" + - "4" + - "5" + - "6" default: "1" - label: "Number of cores/cpus to use" + label: "Cores/CPUs" doc: | - Number of cores/cpus to use - Forced to 1 - 'sd:layout': + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 1 + "sd:layout": advanced: true @@ -688,11 +666,9 @@ steps: default: true color_theme: color_theme parallel_memory_limit: - source: parallel_memory_limit - valueFrom: $(parseInt(self)) + default: 32 vector_memory_limit: - source: vector_memory_limit - valueFrom: $(parseInt(self)) + default: 96 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index d7dfc0a9..faebb404 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -285,6 +285,10 @@ inputs: symbols: - "1" - "2" + - "3" + - "4" + - "5" + - "6" default: "1" label: "Cores/CPUs" doc: | diff --git a/workflows/sc-rna-trajectory.cwl b/workflows/sc-rna-trajectory.cwl index f451de05..4cebe253 100644 --- a/workflows/sc-rna-trajectory.cwl +++ b/workflows/sc-rna-trajectory.cwl @@ -148,6 +148,10 @@ inputs: symbols: - "1" - "2" + - "3" + - "4" + - "5" + - "6" default: "1" label: "Cores/CPUs" doc: | diff --git a/workflows/sc-triangulate.cwl b/workflows/sc-triangulate.cwl index 67db96f9..4febb1d8 100644 --- a/workflows/sc-triangulate.cwl +++ b/workflows/sc-triangulate.cwl @@ -102,47 +102,25 @@ inputs: 'sd:layout': advanced: true - parallel_memory_limit: - type: - - "null" - - type: enum - symbols: - - "32" - default: "32" - label: "Maximum memory in GB allowed to be 
shared between the workers when using multiple CPUs" - doc: | - Maximum memory in GB allowed to be shared between the workers - when using multiple --cpus. - Forced to 32 GB - 'sd:layout': - advanced: true - - vector_memory_limit: - type: - - "null" - - type: enum - symbols: - - "64" - default: "64" - label: "Maximum vector memory in GB allowed to be used by R" - doc: | - Maximum vector memory in GB allowed to be used by R. - Forced to 64 GB - 'sd:layout': - advanced: true - threads: type: - "null" - type: enum symbols: - "1" + - "2" + - "3" + - "4" + - "5" + - "6" default: "1" - label: "Number of cores/cpus to use" + label: "Cores/CPUs" doc: | - Number of cores/cpus to use - Forced to 1 - 'sd:layout': + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 1 + "sd:layout": advanced: true @@ -322,11 +300,9 @@ steps: default: true color_theme: color_theme parallel_memory_limit: - source: parallel_memory_limit - valueFrom: $(parseInt(self)) + default: 32 vector_memory_limit: - source: vector_memory_limit - valueFrom: $(parseInt(self)) + default: 96 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-vdj-profile.cwl b/workflows/sc-vdj-profile.cwl index 12b5e598..84e96dd2 100644 --- a/workflows/sc-vdj-profile.cwl +++ b/workflows/sc-vdj-profile.cwl @@ -166,6 +166,10 @@ inputs: symbols: - "1" - "2" + - "3" + - "4" + - "5" + - "6" default: "1" label: "Cores/CPUs" doc: | diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index f881130a..0a97e5f6 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -182,6 +182,10 @@ inputs: symbols: - "1" - "2" + - "3" + - "4" + - "5" + - "6" default: "1" label: "Cores/CPUs" doc: | diff --git a/workflows/sc_diff_expr.cwl b/workflows/sc_diff_expr.cwl deleted file mode 100644 index 684f5f82..00000000 --- a/workflows/sc_diff_expr.cwl +++ /dev/null @@ -1,421 +0,0 @@ -cwlVersion: v1.0 -class: Workflow - - 
-requirements: - - class: SubworkflowFeatureRequirement - - class: StepInputExpressionRequirement - - class: MultipleInputFeatureRequirement - - class: InlineJavascriptRequirement - expressionLib: - - var split_features = function(line) { - function get_unique(value, index, self) { - return self.indexOf(value) === index && value != ""; - } - let splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; - return (splitted_line && !!splitted_line.length)?splitted_line:null; - }; - - var parse_splitby = function(line) { - return (line == "dataset")?"new.ident":line; - }; - - var parse_resolution = function(line) { - return "integrated_snn_res."+line; - }; - - -'sd:upstream': - seurat_cluster_sample: - - "seurat-cluster.cwl" - - -inputs: - - alias: - type: string - label: "Experiment short name/Alias" - sd:preview: - position: 1 - - seurat_data_rds: - type: File - label: "Seurat Cluster Experiment" - doc: | - Path to the RDS file to load Seurat object from. - RDS file can be produced by run_seurat.R script. - 'sd:upstreamSource': "seurat_cluster_sample/seurat_clst_data_rds" - 'sd:localLabel': true - - splitby: - type: - - "null" - - type: enum - symbols: - - "condition" - - "dataset" - default: "condition" - label: "Divide cell based on" - doc: | - Column from the Seurat object metadata to split cells into two groups - to run second_cond vs first_cond differential expression analysis. May include - columns from the metadata fields added with conditions_data. - - first_cond: - type: string - label: "First group of cells" - doc: | - Value from the Seurat object metadata column set with splitby to define the - first group of cells or pseudobulk RNA-Seq samples (when using pseudo). - - second_cond: - type: string - label: "Second group of cells" - doc: | - Value from the Seurat object metadata column set with splitby to define the - the second group of cells or pseudobulk RNA-Seq samples (when using pseudo). 
- - resolution: - type: string - label: "Clustering resolution to subset cells" - doc: | - Clustering resolution to subset cells. Will be used to define a field from - the Seurat object metadata to group cells for subsetting. - - selected_clusters: - type: string - label: "Comma or space separated list of clusters to subset cells" - doc: | - Value(s) from the column set with groupby (inferred from resolution) to - subset cells before running differential expression analysis. - - selected_features: - type: string? - default: null - label: "Comma or space separated list of genes of interest" - doc: | - Genes of interest to label on the generated plots. - Default: 10 genes with the highest and the - lowest log2 fold change expression values. - 'sd:layout': - advanced: true - - excluded_features: - type: string? - default: null - label: "Comma or space separated list of genes to be excluded" - doc: | - Genes to be excluded from the differential expression analysis. - 'sd:layout': - advanced: true - - minimum_logfc: - type: float? - default: 0.25 - label: "Include only those genes that on average have the absolute value of log2 fold change expression difference not lower than this value" - doc: | - Include only those genes that on average have the absolute value of log2 - fold change expression difference not lower than this value. Increasing - minimum_logfc speeds up calculations, but can cause missing weaker signals. - Ignored with pseudo. - 'sd:layout': - advanced: true - - minimum_pct: - type: float? - default: 0.1 - label: "Include only those genes that are detected in not lower than this fraction of cells in either of the two tested groups" - doc: | - Include only those genes that are detected in not lower than this fraction of cells - in either of the two tested groups. Increasing minimum_pct speeds up calculations by not - testing genes that are very infrequently expressed. Ignored with pseudo. - 'sd:layout': - advanced: true - - maximum_pvadj: - type: float? 
- default: 0.1 - label: "Include only those genes for which adjusted P-val is not bigger that this value" - doc: | - Include only those genes for which adjusted P-val is not bigger that this value. - 'sd:layout': - advanced: true - - test_use: - type: - - "null" - - type: enum - symbols: - - "wilcox" - - "bimod" - - "roc" - - "t" - - "negbinom" - - "poisson" - - "LR" - - "MAST" - - "DESeq2" - default: "wilcox" - label: "Statistical test to use for differential gene expression analysis" - doc: | - Statistical test to use for differential gene expression analysis. - Ignored with pseudo. - 'sd:layout': - advanced: true - - batchby: - type: string? - default: null - label: "Column from the metadata to define the variable that should be modelled as a batch effect" - doc: | - Column from the Seurat object metadata to define the variable that should - be modelled as a batch effect when running differential expression analysis. - Applied only when test_use is one of 'LR', 'negbinom', 'poisson', or 'MAST', - or when using pseudo. May include columns from the metadata fields added - with conditions_data. Values selected from the column set with batchby should - establish 1:1 relation with the 'new.ident' column of the Seurat object loaded - from seurat_data_rds. - 'sd:layout': - advanced: true - - pseudo: - type: boolean? - default: false - label: "Aggregate gene expression of the cells from the same dataset into a pseudobulk RNA-Seq sample" - doc: | - Aggregate gene expression of the cells from the same dataset into a pseudobulk - RNA-Seq sample before running differential expression analysis with DESeq2. - The following parameters will be ignored: test_use, minimum_pct, minimum_logfc. - 'sd:layout': - advanced: true - - lrt: - type: boolean? - default: false - label: "Use LRT instead of the pair-wise Wald test" - doc: | - Use LRT instead of the pair-wise Wald test. 
Shows any differences across the variable - set with batchby whith the log2 fold changes calculated as the average expression - changes due to criteria set with splitby. Ignored when pseudo or batchby - parameters are not provided. - 'sd:layout': - advanced: true - - threads: - type: int? - default: 2 - label: "Threads number to use" - doc: | - Threads number - 'sd:layout': - advanced: true - - conditions_data: - type: File? - label: "TSV/CSV file to optionally extend metadata" - doc: | - Path to the TSV/CSV file to optionally extend Seurat object metadata. First - column 'library_id' should include all unique values from the 'new.ident' - column of the loaded from seurat_data_rds object metadata. All other columns will - be added to the Seurat object metadata. If any of the provided in this file - columns were already present in the Seurat object metadata, they will be - overwritten. - 'sd:layout': - advanced: true - - -outputs: - - cell_abundance_plot_png: - type: File? - outputSource: sc_diff_expr/cell_abundance_plot_png - label: "Cell abundance" - doc: | - Cell abundance plot split by criteria set in splitby (a.k.a condition) and optionally - subsetted by selected_groups (a.k.a clusters) from the groups defined in groupby. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Gene expression plots' - Caption: 'Cell abundance plot' - - cell_abundance_plot_pdf: - type: File? - outputSource: sc_diff_expr/cell_abundance_plot_pdf - label: "Cell abundance" - doc: | - Cell abundance plot split by criteria set in splitby (a.k.a condition) and optionally - subsetted by selected_groups (a.k.a clusters) from the groups defined in groupby. - PDF format - - aggr_gene_expr_plot_png: - type: File? - outputSource: sc_diff_expr/aggr_gene_expr_plot_png - label: "Log normalized aggregated gene expression" - doc: | - Log normalized aggregated gene expression split by criteria set in splitby - (a.k.a condition). 
- PNG format - 'sd:visualPlugins': - - image: - tab: 'Gene expression plots' - Caption: 'Log normalized aggregated gene expression' - - aggr_gene_expr_plot_pdf: - type: File? - outputSource: sc_diff_expr/aggr_gene_expr_plot_pdf - label: "Log normalized aggregated gene expression" - doc: | - Log normalized aggregated gene expression split by criteria set in splitby - (a.k.a condition). - PDF format - - diff_expr_genes_plot_png: - type: File? - outputSource: sc_diff_expr/diff_expr_genes_plot_png - label: "Differentially expressed genes" - doc: | - Volcano plot of differentially expressed genes for second_cond vs first_cond cells - or pseudobulk RNA-Seq samples split by criteria set in splitby (a.k.a condition) - and optionally subsetted by selected_clusters from the groups defined in groupby. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Gene expression plots' - Caption: 'Differentially expressed genes' - - diff_expr_genes_plot_pdf: - type: File? - outputSource: sc_diff_expr/diff_expr_genes_plot_pdf - label: "Differentially expressed genes" - doc: | - Volcano plot of differentially expressed genes for second_cond vs first_cond cells - or pseudobulk RNA-Seq samples split by criteria set in splitby (a.k.a condition) - and optionally subsetted by selected_clusters from the groups defined in groupby. - PDF format - - diff_expr_genes: - type: File - outputSource: sc_diff_expr/diff_expr_genes - label: "Differentially expressed genes" - doc: | - Differentially expressed genes for second_cond vs first_cond cells or pseudobulk - RNA-Seq samples split by criteria set in splitby (a.k.a condition) and optionally - subsetted by selected_clusters from the groups defined in groupby. 
- TSV format - 'sd:visualPlugins': - - syncfusiongrid: - tab: 'Diff expressed genes' - Title: 'Differentially expressed genes' - - sc_diff_expr_stdout_log: - type: File - outputSource: sc_diff_expr/stdout_log - label: stdout log generated by Seurat Differential Expression Analysis - doc: | - stdout log generated by Seurat Differential Expression Analysis - - sc_diff_expr_stderr_log: - type: File - outputSource: sc_diff_expr/stderr_log - label: stderr log generated by Seurat Differential Expression Analysis - doc: | - stderr log generated by Seurat Differential Expression Analysis - - -steps: - - sc_diff_expr: - run: ../tools/sc_diff_expr.cwl - in: - seurat_data_rds: seurat_data_rds - conditions_data: conditions_data - splitby: - source: splitby - valueFrom: $(parse_splitby(self)) - first_cond: first_cond - second_cond: second_cond - batchby: batchby - groupby: - source: resolution - valueFrom: $(parse_resolution(self)) - selected_groups: - source: selected_clusters - valueFrom: $(split_features(self)) - topn_genes_count: - default: 10 - selected_features: - source: selected_features - valueFrom: $(split_features(self)) - excluded_features: - source: excluded_features - valueFrom: $(split_features(self)) - minimum_logfc: minimum_logfc - minimum_pct: minimum_pct - maximum_pvadj: maximum_pvadj - test_use: test_use - pseudo: pseudo - lrt: lrt - export_pdf_plots: - default: true - threads: threads - out: - - cell_abundance_plot_png - - cell_abundance_plot_pdf - - aggr_gene_expr_plot_png - - aggr_gene_expr_plot_pdf - - diff_expr_genes_plot_png - - diff_expr_genes_plot_pdf - - diff_expr_genes - - stdout_log - - stderr_log - - -$namespaces: - s: http://schema.org/ - -$schemas: -- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - -label: "Deprecated. Single-cell Differential Expression" -s:name: "Deprecated. 
Single-cell Differential Expression" -s:alternateName: "Runs differential expression analysis for a subset of cells between two selected conditions" - -s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/sc_diff_expr.cwl -s:codeRepository: https://github.com/datirium/workflows -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Cincinnati Children's Hospital Medical Center" - s:location: - - class: s:PostalAddress - s:addressCountry: "USA" - s:addressLocality: "Cincinnati" - s:addressRegion: "OH" - s:postalCode: "45229" - s:streetAddress: "3333 Burnet Ave" - s:telephone: "+1(513)636-4200" - s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" - s:department: - - class: s:Organization - s:legalName: "Allergy and Immunology" - s:department: - - class: s:Organization - s:legalName: "Barski Research Lab" - s:member: - - class: s:Person - s:name: Michael Kotliar - s:email: mailto:misha.kotliar@gmail.com - s:sameAs: - - id: http://orcid.org/0000-0002-6486-3898 - - -doc: | - Deprecated. Single-cell Differential Expression - =============================================== - - Runs differential expression analysis for a subset of cells between two selected conditions. 
\ No newline at end of file diff --git a/workflows/seurat-cluster.cwl b/workflows/seurat-cluster.cwl deleted file mode 100644 index 435a59ae..00000000 --- a/workflows/seurat-cluster.cwl +++ /dev/null @@ -1,1559 +0,0 @@ -cwlVersion: v1.0 -class: Workflow - - -requirements: - - class: SubworkflowFeatureRequirement - - class: StepInputExpressionRequirement - - class: MultipleInputFeatureRequirement - - class: InlineJavascriptRequirement - expressionLib: - - var split_features = function(line) { - function get_unique(value, index, self) { - return self.indexOf(value) === index && value != ""; - } - let splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; - return (splitted_line && !!splitted_line.length)?splitted_line:null; - }; - - var split_numbers = function(line) { - let splitted_line = line?line.split(/[\s,]+/).map(parseFloat):null; - return (splitted_line && !!splitted_line.length)?splitted_line:null; - }; - - -'sd:upstream': - sc_rnaseq_sample: - - "cellranger-aggr.cwl" - - "single-cell-preprocess-cellranger.cwl" - - -inputs: - - alias: - type: string - label: "Experiment short name/Alias" - sd:preview: - position: 1 - - filtered_feature_bc_matrix_folder: - type: File - label: "scRNA-Seq Cellranger Experiment" - doc: | - Compressed folder with aggregated filtered feature-barcode matrices in MEX format - 'sd:upstreamSource': "sc_rnaseq_sample/filtered_feature_bc_matrix_folder" - 'sd:localLabel': true - - aggregation_metadata: - type: File? - label: "scRNA-Seq Cellranger Experiment" - doc: | - Aggregation metadata in CSV format. - If not provided, we assume that upstream sc_rnaseq_sample - was not from aggregated sample - 'sd:upstreamSource': "sc_rnaseq_sample/aggregation_metadata" - 'sd:localLabel': true - - minimum_cells: - type: int? - default: 5 - label: "Include genes detected in at least this many cells" - doc: | - Include genes detected in at least this many cells - (applied to thoughout all datasets together). 
- 'sd:layout': - advanced: true - - minimum_features: - type: string? - default: "250" - label: "Include cells where at least this many genes are detected" - doc: | - Include cells where at least this many genes are detected. - If multiple values provided each of them will be applied to - the correspondent dataset. - 'sd:layout': - advanced: true - - maximum_features: - type: string? - default: "5000" - label: "Include cells with the number of genes not bigger than this value" - doc: | - Include cells with the number of genes not bigger than this value. - If multiple values provided each of them will be applied to the - correspondent dataset. - 'sd:layout': - advanced: true - - minimum_umis: - type: string? - default: "500" - label: "Include cells where at least this many UMIs are detected" - doc: | - Include cells where at least this many UMIs are detected. - If multiple values provided each of them will be applied - to the correspondent dataset. - 'sd:layout': - advanced: true - - minimum_novelty_score: - type: string? - default: "0.8" - label: "Include cells with the novelty score (the ratio of genes per cell over UMIs per cell) not lower than this value" - doc: | - Include cells with the novelty score (the ratio of genes per cell over UMIs per cell) - not lower than this value (calculated as log10(genes)/log10(UMIs)). If multiple values - provided each of them will be applied to the correspondent dataset. - 'sd:layout': - advanced: true - - maximum_mito_perc: - type: float? - default: 5 - label: "Include cells with the percentage of transcripts mapped to mitochondrial genes not bigger than this value" - doc: | - Include cells with the percentage of transcripts mapped to mitochondrial genes not bigger than this value. - 'sd:layout': - advanced: true - - mito_pattern: - type: string? - default: "^Mt-" - label: "Pattern to identify mitochondrial genes" - doc: | - Pattern to identify mitochondrial genes. 
- 'sd:layout': - advanced: true - - high_var_features_count: - type: int? - default: 3000 - label: "Number of highly variable genes to detect (used for dataset integration and dimensional reduction)" - doc: | - Number of highly variable genes to detect (used for dataset integration and dimensional reduction). - 'sd:layout': - advanced: true - - dimensionality: - type: int? - default: 10 - label: "Number of principal components to use in UMAP projection and clustering (from 1 to 50)" - doc: | - Number of principal components to use in UMAP projection and clustering (from 1 to 50). - Use Elbow plot to adjust this parameter. - 'sd:layout': - advanced: true - - umap_spread: - type: float? - default: 1 - label: "Effective scale of embedded points on UMAP. Determines how clustered/clumped the embedded points are." - doc: | - The effective scale of embedded points on UMAP. In combination with mindist - this determines how clustered/clumped the embedded points are. - 'sd:layout': - advanced: true - - umap_mindist: - type: float? - default: 0.3 - label: "Controls how tightly the embedding is allowed compress points together on UMAP. Sensible values are in the range 0.001 to 0.5" - doc: | - Controls how tightly the embedding is allowed compress points together on UMAP. - Larger values ensure embedded points are moreevenly distributed, while smaller - values allow the algorithm to optimise more accurately with regard to local structure. - Sensible values are in the range 0.001 to 0.5. - 'sd:layout': - advanced: true - - umap_nneighbors: - type: int? - default: 30 - label: "Number of neighboring points used in UMAP. Larger values result in loss of detailed local structure." - doc: | - Determines the number of neighboring points used in UMAP. Larger values will result - in more global structure being preserved at the loss of detailed local structure. - In general this parameter should often be in the range 5 to 50. 
- 'sd:layout': - advanced: true - - umap_metric: - type: - - "null" - - type: enum - symbols: - - "euclidean" - - "manhattan" - - "chebyshev" - - "minkowski" - - "canberra" - - "braycurtis" - - "mahalanobis" - - "wminkowski" - - "seuclidean" - - "cosine" - - "correlation" - - "haversine" - - "hamming" - - "jaccard" - - "dice" - - "russelrao" - - "kulsinski" - - "ll_dirichlet" - - "hellinger" - - "rogerstanimoto" - - "sokalmichener" - - "sokalsneath" - - "yule" - default: "cosine" - label: "The metric to use to compute distances in high dimensional space for UMAP" - doc: | - The metric to use to compute distances in high dimensional space for UMAP. - 'sd:layout': - advanced: true - - umap_method: - type: - - "null" - - type: enum - symbols: - - "uwot" - - "uwot-learn" - - "umap-learn" - default: "uwot" - label: "UMAP implementation to run" - doc: | - UMAP implementation to run. - 'sd:layout': - advanced: true - - cluster_metric: - type: - - "null" - - type: enum - symbols: - - "euclidean" - - "cosine" - - "manhattan" - - "hamming" - default: "euclidean" - label: "Distance metric used by the nearest neighbors algorithm when running clustering" - doc: | - Distance metric used by the nearest neighbors algorithm when running clustering. - 'sd:layout': - advanced: true - - resolution: - type: string? - default: "0.1" - label: "Comma or space separated list of clustering resolutions" - doc: | - Comma or space separated list of clustering resolutions - 'sd:layout': - advanced: true - - minimum_logfc: - type: float? - default: 0.25 - label: "Include only those genes that on average have log fold change difference in expression between every tested pair of clusters not lower than this value" - doc: | - Include only those genes that on average have log fold change difference in - expression between every tested pair of clusters not lower than this value. - 'sd:layout': - advanced: true - - minimum_pct: - type: float? 
- default: 0.1 - label: "Include only those genes that are detected in not lower than this fraction of cells in either of the two tested clusters" - doc: | - Include only those genes that are detected in not lower than - this fraction of cells in either of the two tested clusters. - 'sd:layout': - advanced: true - - test_use: - type: - - "null" - - type: enum - symbols: - - "wilcox" - - "bimod" - - "roc" - - "t" - - "negbinom" - - "poisson" - - "LR" - - "MAST" - - "DESeq2" - default: "wilcox" - label: "Statistical test to use for gene markers identification" - doc: | - Statistical test to use for gene markers identification. - 'sd:layout': - advanced: true - - threads: - type: int? - default: 6 - label: "Threads number to use" - doc: | - Threads number - 'sd:layout': - advanced: true - - species: - type: - - "null" - - type: enum - symbols: - - "hs" - - "mm" - - "none" - default: "none" - label: "Species for gene name conversion when running cell type prediction" - doc: | - Select species for gene name conversion when running cell type prediction - with Garnett classifier. - If "none" - do not convert gene names - 'sd:layout': - advanced: true - - regress_cellcycle: - type: boolean? - default: false - label: "Regress cell cycle as a confounding source of variation" - doc: | - Regress cell cycle as a confounding source of variation. - 'sd:layout': - advanced: true - - regress_mito_perc: - type: boolean? - default: false - label: "Regress mitochondrial gene expression as a confounding source of variation" - doc: | - Regress mitochondrial gene expression as a confounding source - of variation. - 'sd:layout': - advanced: true - - only_positive_markers: - type: boolean? - default: false - label: "Report only positive gene markers" - doc: | - Report only positive gene markers. - 'sd:layout': - advanced: true - - no_sct: - type: boolean? 
- default: false - label: "Use LogNormalize instead of SCTransform when integrating datasets" - doc: | - Do not use SCTransform when running datasets integration. Use LogNormalize instead. - 'sd:layout': - advanced: true - - selected_features: - type: string? - default: null - label: "Comma or space separated list of genes of interest" - doc: | - Comma or space separated list of genes of interest. - Default: do not highlight any features - 'sd:layout': - advanced: true - - conditions_data: - type: File? - label: "TSV/CSV file to define datasets conditions with 'library_id' and 'condition' columns. Rows order should correspond to the aggregation metadata." - doc: | - Path to the TSV/CSV file to define datasets grouping. First column - - 'library_id' with the values provided in the same order as in the - correspondent column of the --identity file, second column 'condition'. - If not provided, each dataset is assigned to its own - biological condition - - barcodes_data: - type: File? - label: "Headerless TSV/CSV file with cell barcodes (one barcode per line) to prefilter input data" - doc: | - Path to the headerless TSV/CSV file with selected barcodes - (one per line) to prefilter input feature-barcode matrices. - If not provided, use all cells - 'sd:layout': - advanced: true - - cell_cycle_data: - type: File? - label: "TSV/CSV file with cell cycle data with 'phase' and 'gene_id' columns" - doc: | - TSV/CSV file with cell cycle data. First column - 'phase', second column 'gene_id'. - If not provided, skip cell cycle score assignment - 'sd:layout': - advanced: true - - classifier_rds: - type: File? - label: "Garnett classifier rds file for cell type prediction" - doc: | - Path to the Garnett classifier rds file for cell type prediction. - If not provided, skip cell type prediction - 'sd:layout': - advanced: true - - -outputs: - - raw_cell_count_plot_png: - type: File? 
- outputSource: seurat_cluster/raw_cell_count_plot_png - label: "Number of cells per dataset (not filtered)" - doc: | - Number of cells per dataset (not filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (not filtered)' - Caption: 'Number of cells per dataset (not filtered)' - - raw_cell_count_plot_pdf: - type: File? - outputSource: seurat_cluster/raw_cell_count_plot_pdf - label: "Number of cells per dataset (not filtered)" - doc: | - Number of cells per dataset (not filtered). - PDF format - - - raw_umi_dnst_spl_by_cond_plot_png: - type: File? - outputSource: seurat_cluster/raw_umi_dnst_spl_by_cond_plot_png - label: "Split by condition UMI density per cell (not filtered)" - doc: | - Split by condition UMI density per cell (not filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (not filtered)' - Caption: 'Split by condition UMI density per cell (not filtered)' - - raw_umi_dnst_spl_by_cond_plot_pdf: - type: File? - outputSource: seurat_cluster/raw_umi_dnst_spl_by_cond_plot_pdf - label: "Split by condition UMI density per cell (not filtered)" - doc: | - Split by condition UMI density per cell (not filtered). - PDF format - - - raw_gene_dnst_spl_by_cond_plot_png: - type: File? - outputSource: seurat_cluster/raw_gene_dnst_spl_by_cond_plot_png - label: "Split by condition gene density per cell (not filtered)" - doc: | - Split by condition gene density per cell (not filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (not filtered)' - Caption: 'Split by condition gene density per cell (not filtered)' - - raw_gene_dnst_spl_by_cond_plot_pdf: - type: File? - outputSource: seurat_cluster/raw_gene_dnst_spl_by_cond_plot_pdf - label: "Split by condition gene density per cell (not filtered)" - doc: | - Split by condition gene density per cell (not filtered). - PDF format - - - raw_gene_umi_corr_spl_by_ident_plot_png: - type: File? 
- outputSource: seurat_cluster/raw_gene_umi_corr_spl_by_ident_plot_png - label: "Split by identity genes vs UMIs per cell correlation (not filtered)" - doc: | - Split by identity genes vs UMIs per cell correlation (not filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (not filtered)' - Caption: 'Split by identity genes vs UMIs per cell correlation (not filtered)' - - raw_gene_umi_corr_spl_by_ident_plot_pdf: - type: File? - outputSource: seurat_cluster/raw_gene_umi_corr_spl_by_ident_plot_pdf - label: "Split by identity genes vs UMIs per cell correlation (not filtered)" - doc: | - Split by identity genes vs UMIs per cell correlation (not filtered). - PDF format - - - raw_mito_perc_dnst_spl_by_cond_plot_png: - type: File? - outputSource: seurat_cluster/raw_mito_perc_dnst_spl_by_cond_plot_png - label: "Split by condition density of transcripts mapped to mitochondrial genes per cell (not filtered)" - doc: | - Split by condition density of transcripts mapped to mitochondrial genes per cell (not filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (not filtered)' - Caption: 'Split by condition density of transcripts mapped to mitochondrial genes per cell (not filtered)' - - raw_mito_perc_dnst_spl_by_cond_plot_pdf: - type: File? - outputSource: seurat_cluster/raw_mito_perc_dnst_spl_by_cond_plot_pdf - label: "Split by condition density of transcripts mapped to mitochondrial genes per cell (not filtered)" - doc: | - Split by condition density of transcripts mapped to mitochondrial genes per cell (not filtered). - PDF format - - - raw_nvlt_score_dnst_spl_by_cond_plot_png: - type: File? - outputSource: seurat_cluster/raw_nvlt_score_dnst_spl_by_cond_plot_png - label: "Split by condition novelty score density per cell (not filtered)" - doc: | - Split by condition novelty score density per cell (not filtered). 
- PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (not filtered)' - Caption: 'Split by condition novelty score density per cell (not filtered)' - - raw_nvlt_score_dnst_spl_by_cond_plot_pdf: - type: File? - outputSource: seurat_cluster/raw_nvlt_score_dnst_spl_by_cond_plot_pdf - label: "Split by condition novelty score density per cell (not filtered)" - doc: | - Split by condition novelty score density per cell (not filtered). - PDF format - - - raw_qc_mtrcs_plot_png: - type: File? - outputSource: seurat_cluster/raw_qc_mtrcs_plot_png - label: "QC metrics densities per cell (not filtered)" - doc: | - QC metrics densities per cell (not filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (not filtered)' - Caption: 'QC metrics densities per cell (not filtered)' - - raw_qc_mtrcs_plot_pdf: - type: File? - outputSource: seurat_cluster/raw_qc_mtrcs_plot_pdf - label: "QC metrics densities per cell (not filtered)" - doc: | - QC metrics densities per cell (not filtered). - PDF format - - - raw_qc_mtrcs_gr_by_cond_plot_png: - type: File? - outputSource: seurat_cluster/raw_qc_mtrcs_gr_by_cond_plot_png - label: "Grouped by condition QC metrics densities per cell (not filtered)" - doc: | - Grouped by condition QC metrics densities per cell (not filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (not filtered)' - Caption: 'Grouped by condition QC metrics densities per cell (not filtered)' - - raw_qc_mtrcs_gr_by_cond_plot_pdf: - type: File? - outputSource: seurat_cluster/raw_qc_mtrcs_gr_by_cond_plot_pdf - label: "Grouped by condition QC metrics densities per cell (not filtered)" - doc: | - Grouped by condition QC metrics densities per cell (not filtered). - PDF format - - - fltr_cell_count_plot_png: - type: File? - outputSource: seurat_cluster/fltr_cell_count_plot_png - label: "Number of cells per dataset (filtered)" - doc: | - Number of cells per dataset (filtered). 
- PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (filtered)' - Caption: 'Number of cells per dataset (filtered)' - - fltr_cell_count_plot_pdf: - type: File? - outputSource: seurat_cluster/fltr_cell_count_plot_pdf - label: "Number of cells per dataset (filtered)" - doc: | - Number of cells per dataset (filtered). - PDF format - - - fltr_umi_dnst_spl_by_cond_plot_png: - type: File? - outputSource: seurat_cluster/fltr_umi_dnst_spl_by_cond_plot_png - label: "Split by condition UMI density per cell (filtered)" - doc: | - Split by condition UMI density per cell (filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (filtered)' - Caption: 'Split by condition UMI density per cell (filtered)' - - fltr_umi_dnst_spl_by_cond_plot_pdf: - type: File? - outputSource: seurat_cluster/fltr_umi_dnst_spl_by_cond_plot_pdf - label: "Split by condition UMI density per cell (filtered)" - doc: | - Split by condition UMI density per cell (filtered). - PDF format - - - fltr_gene_dnst_spl_by_cond_plot_png: - type: File? - outputSource: seurat_cluster/fltr_gene_dnst_spl_by_cond_plot_png - label: "Split by condition gene density per cell (filtered)" - doc: | - Split by condition gene density per cell (filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (filtered)' - Caption: 'Split by condition gene density per cell (filtered)' - - fltr_gene_dnst_spl_by_cond_plot_pdf: - type: File? - outputSource: seurat_cluster/fltr_gene_dnst_spl_by_cond_plot_pdf - label: "Split by condition gene density per cell (filtered)" - doc: | - Split by condition gene density per cell (filtered). - PDF format - - - fltr_gene_umi_corr_spl_by_ident_plot_png: - type: File? - outputSource: seurat_cluster/fltr_gene_umi_corr_spl_by_ident_plot_png - label: "Split by identity genes vs UMIs per cell correlation (filtered)" - doc: | - Split by identity genes vs UMIs per cell correlation (filtered). 
- PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (filtered)' - Caption: 'Split by identity genes vs UMIs per cell correlation (filtered)' - - fltr_gene_umi_corr_spl_by_ident_plot_pdf: - type: File? - outputSource: seurat_cluster/fltr_gene_umi_corr_spl_by_ident_plot_pdf - label: "Split by identity genes vs UMIs per cell correlation (filtered)" - doc: | - Split by identity genes vs UMIs per cell correlation (filtered). - PDF format - - - fltr_mito_perc_dnst_spl_by_cond_plot_png: - type: File? - outputSource: seurat_cluster/fltr_mito_perc_dnst_spl_by_cond_plot_png - label: "Split by condition density of transcripts mapped to mitochondrial genes per cell (filtered)" - doc: | - Split by condition density of transcripts mapped to mitochondrial genes per cell (filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (filtered)' - Caption: 'Split by condition density of transcripts mapped to mitochondrial genes per cell (filtered)' - - fltr_mito_perc_dnst_spl_by_cond_plot_pdf: - type: File? - outputSource: seurat_cluster/fltr_mito_perc_dnst_spl_by_cond_plot_pdf - label: "Split by condition density of transcripts mapped to mitochondrial genes per cell (filtered)" - doc: | - Split by condition density of transcripts mapped to mitochondrial genes per cell (filtered). - PDF format - - - fltr_nvlt_score_dnst_spl_by_cond_plot_png: - type: File? - outputSource: seurat_cluster/fltr_nvlt_score_dnst_spl_by_cond_plot_png - label: "Split by condition novelty score density per cell (filtered)" - doc: | - Split by condition novelty score density per cell (filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (filtered)' - Caption: 'Split by condition novelty score density per cell (filtered)' - - fltr_nvlt_score_dnst_spl_by_cond_plot_pdf: - type: File? 
- outputSource: seurat_cluster/fltr_nvlt_score_dnst_spl_by_cond_plot_pdf - label: "Split by condition novelty score density per cell (filtered)" - doc: | - Split by condition novelty score density per cell (filtered). - PDF format - - - fltr_qc_mtrcs_plot_png: - type: File? - outputSource: seurat_cluster/fltr_qc_mtrcs_plot_png - label: "QC metrics densities per cell (filtered)" - doc: | - QC metrics densities per cell (filtered). - PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (filtered)' - Caption: 'QC metrics densities per cell (filtered)' - - fltr_qc_mtrcs_plot_pdf: - type: File? - outputSource: seurat_cluster/fltr_qc_mtrcs_plot_pdf - label: "QC metrics densities per cell (filtered)" - doc: | - QC metrics densities per cell (filtered). - PDF format - - - fltr_qc_mtrcs_gr_by_cond_plot_png: - type: File? - outputSource: seurat_cluster/fltr_qc_mtrcs_gr_by_cond_plot_png - label: "Grouped by condition QC metrics densities per cell (filtered)" - doc: | - Grouped by condition QC metrics densities per cell (filtered). - PDF format - 'sd:visualPlugins': - - image: - tab: 'QC (filtered)' - Caption: 'Grouped by condition QC metrics densities per cell (filtered)' - - fltr_qc_mtrcs_gr_by_cond_plot_pdf: - type: File? - outputSource: seurat_cluster/fltr_qc_mtrcs_gr_by_cond_plot_pdf - label: "Grouped by condition QC metrics densities per cell (filtered)" - doc: | - Grouped by condition QC metrics densities per cell (filtered). - PDF format - - - fltr_pca_spl_by_ph_plot_png: - type: File? - outputSource: seurat_cluster/fltr_pca_spl_by_ph_plot_png - label: "Split by cell cycle phase PCA of filtered unintegrated/scaled datasets" - doc: | - Split by cell cycle phase PCA of filtered unintegrated/scaled datasets. - PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (filtered)' - Caption: 'Split by cell cycle phase PCA of filtered unintegrated/scaled datasets' - - fltr_pca_spl_by_ph_plot_pdf: - type: File? 
- outputSource: seurat_cluster/fltr_pca_spl_by_ph_plot_pdf - label: "Split by cell cycle phase PCA of filtered unintegrated/scaled datasets" - doc: | - Split by cell cycle phase PCA of filtered unintegrated/scaled datasets. - PDF format - - - fltr_pca_spl_by_mito_perc_plot_png: - type: File? - outputSource: seurat_cluster/fltr_pca_spl_by_mito_perc_plot_png - label: "Split by level of transcripts mapped to mitochondrial genes PCA of filtered unintegrated/scaled datasets" - doc: | - Split by level of transcripts mapped to mitochondrial genes PCA of filtered unintegrated/scaled datasets. - PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (filtered)' - Caption: 'Split by level of transcripts mapped to mitochondrial genes PCA of filtered unintegrated/scaled datasets' - - fltr_pca_spl_by_mito_perc_plot_pdf: - type: File? - outputSource: seurat_cluster/fltr_pca_spl_by_mito_perc_plot_pdf - label: "Split by level of transcripts mapped to mitochondrial genes PCA of filtered unintegrated/scaled datasets" - doc: | - Split by level of transcripts mapped to mitochondrial genes PCA of filtered unintegrated/scaled datasets. - PDF format - - - fltr_umap_spl_by_idnt_plot_png: - type: File? - outputSource: seurat_cluster/fltr_umap_spl_by_idnt_plot_png - label: "Split by identity UMAP projected PCA of filtered unintegrated/scaled datasets" - doc: | - Split by identity UMAP projected PCA of filtered unintegrated/scaled datasets. - PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (filtered)' - Caption: 'Split by identity UMAP projected PCA of filtered unintegrated/scaled datasets' - - fltr_umap_spl_by_idnt_plot_pdf: - type: File? - outputSource: seurat_cluster/fltr_umap_spl_by_idnt_plot_pdf - label: "Split by identity UMAP projected PCA of filtered unintegrated/scaled datasets" - doc: | - Split by identity UMAP projected PCA of filtered unintegrated/scaled datasets. - PDF format - - - ntgr_elbow_plot_png: - type: File? 
- outputSource: seurat_cluster/ntgr_elbow_plot_png - label: "Elbow plot from PCA of filtered integrated/scaled datasets" - doc: | - Elbow plot from PCA of filtered integrated/scaled datasets. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Dimensionality evaluation' - Caption: 'Elbow plot from PCA of filtered integrated/scaled datasets' - - ntgr_elbow_plot_pdf: - type: File? - outputSource: seurat_cluster/ntgr_elbow_plot_pdf - label: "Elbow plot from PCA of filtered integrated/scaled datasets" - doc: | - Elbow plot from PCA of filtered integrated/scaled datasets. - PDF format - - - ntgr_pca_plot_png: - type: File? - outputSource: seurat_cluster/ntgr_pca_plot_png - label: "PCA of filtered integrated/scaled datasets" - doc: | - PCA of filtered integrated/scaled datasets. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Dimensionality evaluation' - Caption: 'PCA of filtered integrated/scaled datasets' - - ntgr_pca_plot_pdf: - type: File? - outputSource: seurat_cluster/ntgr_pca_plot_pdf - label: "PCA of filtered integrated/scaled datasets" - doc: | - PCA of filtered integrated/scaled datasets. - PDF format - - - ntgr_pca_heatmap_png: - type: File? - outputSource: seurat_cluster/ntgr_pca_heatmap_png - label: "Genes per cells expression heatmap sorted by their PC scores from PCA of filtered integrated/scaled datasets" - doc: | - Genes per cells expression heatmap sorted by their PC scores from PCA of filtered integrated/scaled datasets. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Dimensionality evaluation' - Caption: 'Genes per cells expression heatmap sorted by their PC scores from PCA of filtered integrated/scaled datasets' - - ntgr_pca_heatmap_pdf: - type: File? 
- outputSource: seurat_cluster/ntgr_pca_heatmap_pdf - label: "Genes per cells expression heatmap sorted by their PC scores from PCA of filtered integrated/scaled datasets" - doc: | - Genes per cells expression heatmap sorted by their PC scores from PCA of filtered integrated/scaled datasets. - PDF format - - - ntgr_pca_loadings_plot_png: - type: File? - outputSource: seurat_cluster/ntgr_pca_loadings_plot_png - label: "PC scores of the most variant genes from PCA of filtered integrated/scaled datasets" - doc: | - PC scores of the most variant genes from PCA of filtered integrated/scaled datasets. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Dimensionality evaluation' - Caption: 'PC scores of the most variant genes from PCA of filtered integrated/scaled datasets' - - ntgr_pca_loadings_plot_pdf: - type: File? - outputSource: seurat_cluster/ntgr_pca_loadings_plot_pdf - label: "PC scores of the most variant genes from PCA of filtered integrated/scaled datasets" - doc: | - PC scores of the most variant genes from PCA of filtered integrated/scaled datasets. - PDF format - - - ntgr_umap_spl_by_idnt_plot_png: - type: File? - outputSource: seurat_cluster/ntgr_umap_spl_by_idnt_plot_png - label: "Split by identity UMAP projected PCA of filtered integrated/scaled datasets" - doc: | - Split by identity UMAP projected PCA of filtered integrated/scaled datasets. - PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (integrated/scaled)' - Caption: 'Split by identity UMAP projected PCA of filtered integrated/scaled datasets' - - ntgr_umap_spl_by_idnt_plot_pdf: - type: File? - outputSource: seurat_cluster/ntgr_umap_spl_by_idnt_plot_pdf - label: "Split by identity UMAP projected PCA of filtered integrated/scaled datasets" - doc: | - Split by identity UMAP projected PCA of filtered integrated/scaled datasets. 
- PDF format - - - clst_umap_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/clst_umap_res_plot_png - label: "Clustered UMAP projected PCA of filtered integrated/scaled datasets" - doc: | - Clustered UMAP projected PCA of filtered integrated/scaled datasets. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Clustering' - Caption: 'Clustered UMAP projected PCA of filtered integrated/scaled datasets' - - clst_umap_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/clst_umap_res_plot_pdf - label: "Clustered UMAP projected PCA of filtered integrated/scaled datasets" - doc: | - Clustered UMAP projected PCA of filtered integrated/scaled datasets. - PDF format - - - clst_umap_spl_by_cond_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/clst_umap_spl_by_cond_res_plot_png - label: "Split by condition clustered UMAP projected PCA of filtered integrated/scaled datasets" - doc: | - Split by condition clustered UMAP projected PCA of filtered integrated/scaled datasets. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Clustering' - Caption: 'Split by condition clustered UMAP projected PCA of filtered integrated/scaled datasets' - - clst_umap_spl_by_cond_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/clst_umap_spl_by_cond_res_plot_pdf - label: "Split by condition clustered UMAP projected PCA of filtered integrated/scaled datasets" - doc: | - Split by condition clustered UMAP projected PCA of filtered integrated/scaled datasets. - PDF format - - - clst_umap_ctype_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/clst_umap_ctype_res_plot_png - label: "Grouped by predicted cell types UMAP projected PCA of filtered integrated/scaled datasets" - doc: | - Grouped by predicted cell types UMAP projected PCA of filtered integrated/scaled datasets. 
- PNG format - 'sd:visualPlugins': - - image: - tab: 'Clustering' - Caption: 'Grouped by predicted cell types UMAP projected PCA of filtered integrated/scaled datasets' - - clst_umap_ctype_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/clst_umap_ctype_res_plot_pdf - label: "Grouped by predicted cell types UMAP projected PCA of filtered integrated/scaled datasets" - doc: | - Grouped by predicted cell types UMAP projected PCA of filtered integrated/scaled datasets. - PDF format - - - clst_umap_spl_by_ph_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/clst_umap_spl_by_ph_res_plot_png - label: "Split by cell cycle phase clustered UMAP projected PCA of filtered integrated/scaled datasets" - doc: | - Split by cell cycle phase clustered UMAP projected PCA of filtered integrated/scaled datasets. - PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (integrated/scaled)' - Caption: 'Split by cell cycle phase clustered UMAP projected PCA of filtered integrated/scaled datasets' - - clst_umap_spl_by_ph_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/clst_umap_spl_by_ph_res_plot_pdf - label: "Split by cell cycle phase clustered UMAP projected PCA of filtered integrated/scaled datasets" - doc: | - Split by cell cycle phase clustered UMAP projected PCA of filtered integrated/scaled datasets. - PDF format - - - clst_qc_mtrcs_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/clst_qc_mtrcs_res_plot_png - label: "QC metrics for clustered UMAP projected PCA of filtered integrated/scaled datasets" - doc: | - QC metrics for clustered UMAP projected PCA of filtered integrated/scaled datasets. 
- PNG format - 'sd:visualPlugins': - - image: - tab: 'QC (integrated/scaled)' - Caption: 'QC metrics for clustered UMAP projected PCA of filtered integrated/scaled datasets' - - clst_qc_mtrcs_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/clst_qc_mtrcs_res_plot_pdf - label: "QC metrics for clustered UMAP projected PCA of filtered integrated/scaled datasets" - doc: | - QC metrics for clustered UMAP projected PCA of filtered integrated/scaled datasets. - PDF format - - - expr_avg_per_clst_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/expr_avg_per_clst_res_plot_png - label: "Scaled average log normalized gene expression per cluster of filtered integrated/scaled datasets" - doc: | - Scaled average log normalized gene expression per cluster of filtered integrated/scaled datasets. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Gene expression' - Caption: 'Scaled average log normalized gene expression per cluster of filtered integrated/scaled datasets' - - expr_avg_per_clst_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/expr_avg_per_clst_res_plot_pdf - label: "Scaled average log normalized gene expression per cluster of filtered integrated/scaled datasets" - doc: | - Scaled average log normalized gene expression per cluster of filtered integrated/scaled datasets. - PDF format - - - expr_per_clst_cell_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/expr_per_clst_cell_res_plot_png - label: "Log normalized gene expression per cell of clustered filtered integrated/scaled datasets" - doc: | - Log normalized gene expression per cell of clustered filtered integrated/scaled datasets. 
- PNG format - 'sd:visualPlugins': - - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression per cell of clustered filtered integrated/scaled datasets' - - expr_per_clst_cell_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/expr_per_clst_cell_res_plot_pdf - label: "Log normalized gene expression per cell of clustered filtered integrated/scaled datasets" - doc: | - Log normalized gene expression per cell of clustered filtered integrated/scaled datasets. - PDF format - - - expr_clst_heatmap_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/expr_clst_heatmap_res_plot_png - label: "Log normalized gene expression heatmap of clustered filtered integrated/scaled datasets" - doc: | - Log normalized gene expression heatmap of clustered filtered integrated/scaled datasets. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression heatmap of clustered filtered integrated/scaled datasets' - - expr_clst_heatmap_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/expr_clst_heatmap_res_plot_pdf - label: "Log normalized gene expression heatmap of clustered filtered integrated/scaled datasets" - doc: | - Log normalized gene expression heatmap of clustered filtered integrated/scaled datasets. - PDF format - - - expr_dnst_per_clst_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/expr_dnst_per_clst_res_plot_png - label: "Log normalized gene expression densities per cluster of filtered integrated/scaled datasets" - doc: | - Log normalized gene expression densities per cluster of filtered integrated/scaled datasets. 
- PNG format - 'sd:visualPlugins': - - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression densities per cluster of filtered integrated/scaled datasets' - - expr_dnst_per_clst_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/expr_dnst_per_clst_res_plot_pdf - label: "Log normalized gene expression densities per cluster of filtered integrated/scaled datasets" - doc: | - Log normalized gene expression densities per cluster of filtered integrated/scaled datasets. - PDF format - - - expr_avg_per_ctype_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/expr_avg_per_ctype_res_plot_png - label: "Scaled average log normalized gene expression per predicted cell type of filtered integrated/scaled datasets" - doc: | - Scaled average log normalized gene expression per predicted cell type of filtered integrated/scaled datasets. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Gene expression' - Caption: 'Scaled average log normalized gene expression per predicted cell type of filtered integrated/scaled datasets' - - expr_avg_per_ctype_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/expr_avg_per_ctype_res_plot_pdf - label: "Scaled average log normalized gene expression per predicted cell type of filtered integrated/scaled datasets" - doc: | - Scaled average log normalized gene expression per predicted cell type of filtered integrated/scaled datasets. - PDF format - - - expr_per_ctype_cell_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/expr_per_ctype_cell_res_plot_png - label: "Log normalized gene expression per cell of clustered filtered integrated/scaled datasets with predicted cell types" - doc: | - Log normalized gene expression per cell of clustered filtered/scaled integrated datasets with predicted cell types. 
- PNG format - 'sd:visualPlugins': - - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression per cell of clustered filtered/scaled integrated datasets with predicted cell types' - - expr_per_ctype_cell_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/expr_per_ctype_cell_res_plot_pdf - label: "Log normalized gene expression per cell of clustered filtered integrated/scaled datasets with predicted cell types" - doc: | - Log normalized gene expression per cell of clustered filtered integrated/scaled datasets with predicted cell types. - PDF format - - - expr_ctype_heatmap_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/expr_ctype_heatmap_res_plot_png - label: "Log normalized gene expression heatmap of clustered filtered integrated/scaled datasets with predicted cell types" - doc: | - Log normalized gene expression heatmap of clustered filtered integrated/scaled datasets with predicted cell types. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression heatmap of clustered filtered integrated/scaled datasets with predicted cell types' - - expr_ctype_heatmap_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/expr_ctype_heatmap_res_plot_pdf - label: "Log normalized gene expression heatmap of clustered filtered integrated/scaled datasets with predicted cell types" - doc: | - Log normalized gene expression heatmap of clustered filtered integrated/scaled datasets with predicted cell types. 
- PDF format - - - expr_dnst_per_ctype_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/expr_dnst_per_ctype_res_plot_png - label: "Log normalized gene expression densities per predicted cell type of filtered integrated/scaled datasets" - doc: | - Log normalized gene expression densities per predicted cell type of filtered integrated/scaled datasets. - PNG format - 'sd:visualPlugins': - - image: - tab: 'Gene expression' - Caption: 'Log normalized gene expression densities per predicted cell type of filtered integrated/scaled datasets' - - expr_dnst_per_ctype_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputSource: seurat_cluster/expr_dnst_per_ctype_res_plot_pdf - label: "Log normalized gene expression densities per predicted cell type of filtered integrated/scaled datasets" - doc: | - Log normalized gene expression densities per predicted cell type of filtered integrated/scaled datasets. - PDF format - - - seurat_clst_data_rds: - type: File - outputSource: seurat_cluster/seurat_clst_data_rds - label: "Clustered filtered integrated/scaled Seurat data" - doc: | - Clustered filtered integrated Seurat data. - RDS format - - - clst_pttv_gene_markers: - type: File - outputSource: seurat_cluster/clst_pttv_gene_markers - label: "Putative gene markers file for all clusters and all resolutions" - doc: | - Putative gene markers file for all clusters and all resolutions. - TSV format - 'sd:visualPlugins': - - syncfusiongrid: - tab: 'Putative gene markers' - Title: 'Putative gene markers' - - - clst_csrvd_gene_markers: - type: File - outputSource: seurat_cluster/clst_csrvd_gene_markers - label: "Conserved gene markers file for all clusters and all resolutions" - doc: | - Conserved gene markers file for all clusters and all resolutions. 
- TSV format - 'sd:visualPlugins': - - syncfusiongrid: - tab: 'Conserved gene markers' - Title: 'Conserved gene markers' - - - compressed_cellbrowser_config_data: - type: File - outputSource: compress_cellbrowser_config_data/compressed_folder - label: "Compressed directory with UCSC Cellbrowser configuration data" - doc: | - Compressed directory with UCSC Cellbrowser configuration data - - cellbrowser_html_data: - type: Directory - outputSource: seurat_cluster/cellbrowser_html_data - label: "Directory with UCSC Cellbrowser formatted html data" - doc: | - Directory with UCSC Cellbrowser formatted html data - - cellbrowser_html_file: - type: File - outputSource: seurat_cluster/cellbrowser_html_file - label: "Open in UCSC Cell Browser" - doc: | - HTML index file from the directory with UCSC Cellbrowser formatted html data - 'sd:visualPlugins': - - linkList: - tab: 'Overview' - target: "_blank" - - - seurat_cluster_stdout_log: - type: File - outputSource: seurat_cluster/stdout_log - label: stdout log generated by Seurat - doc: | - stdout log generated by Seurat - - seurat_cluster_stderr_log: - type: File - outputSource: seurat_cluster/stderr_log - label: stderr log generated by Seurat - doc: | - stderr log generated by Seurat - - -steps: - - uncompress_feature_bc_matrices: - in: - compressed: filtered_feature_bc_matrix_folder - out: - - uncompressed - run: - cwlVersion: v1.0 - class: CommandLineTool - hints: - - class: DockerRequirement - dockerPull: biowardrobe2/scidap:v0.0.3 - inputs: - compressed: - type: File - inputBinding: - position: 1 - outputs: - uncompressed: - type: Directory - outputBinding: - glob: "*" - baseCommand: ["tar", "xzf"] - - seurat_cluster: - run: ../tools/seurat-cluster.cwl - in: - feature_bc_matrices_folder: uncompress_feature_bc_matrices/uncompressed - aggregation_metadata: aggregation_metadata - cell_cycle_data: cell_cycle_data - conditions_data: conditions_data - classifier_rds: classifier_rds - species: species - barcodes_data: 
barcodes_data - minimum_cells: minimum_cells - minimum_features: - source: minimum_features - valueFrom: $(split_numbers(self)) - maximum_features: - source: maximum_features - valueFrom: $(split_numbers(self)) - selected_features: - source: selected_features - valueFrom: $(split_features(self)) - minimum_umis: - source: minimum_umis - valueFrom: $(split_numbers(self)) - minimum_novelty_score: - source: minimum_novelty_score - valueFrom: $(split_numbers(self)) - maximum_mito_perc: maximum_mito_perc - mito_pattern: mito_pattern - regress_cellcycle: regress_cellcycle - regress_mito_perc: regress_mito_perc - high_var_features_count: high_var_features_count - dimensionality: dimensionality - umap_spread: umap_spread - umap_mindist: umap_mindist - umap_nneighbors: umap_nneighbors - umap_metric: umap_metric - umap_method: umap_method - no_sct: no_sct - cluster_metric: cluster_metric - resolution: - source: resolution - valueFrom: $(split_numbers(self)) - minimum_logfc: minimum_logfc - minimum_pct: minimum_pct - only_positive_markers: only_positive_markers - test_use: test_use - export_pdf_plots: - default: true - export_rds_data: - default: true - threads: threads - out: - - raw_cell_count_plot_png - - raw_cell_count_plot_pdf - - raw_umi_dnst_spl_by_cond_plot_png - - raw_umi_dnst_spl_by_cond_plot_pdf - - raw_gene_dnst_spl_by_cond_plot_png - - raw_gene_dnst_spl_by_cond_plot_pdf - - raw_gene_umi_corr_spl_by_ident_plot_png - - raw_gene_umi_corr_spl_by_ident_plot_pdf - - raw_mito_perc_dnst_spl_by_cond_plot_png - - raw_mito_perc_dnst_spl_by_cond_plot_pdf - - raw_nvlt_score_dnst_spl_by_cond_plot_png - - raw_nvlt_score_dnst_spl_by_cond_plot_pdf - - raw_qc_mtrcs_plot_png - - raw_qc_mtrcs_plot_pdf - - raw_qc_mtrcs_gr_by_cond_plot_png - - raw_qc_mtrcs_gr_by_cond_plot_pdf - - fltr_cell_count_plot_png - - fltr_cell_count_plot_pdf - - fltr_umi_dnst_spl_by_cond_plot_png - - fltr_umi_dnst_spl_by_cond_plot_pdf - - fltr_gene_dnst_spl_by_cond_plot_png - - 
fltr_gene_dnst_spl_by_cond_plot_pdf - - fltr_gene_umi_corr_spl_by_ident_plot_png - - fltr_gene_umi_corr_spl_by_ident_plot_pdf - - fltr_mito_perc_dnst_spl_by_cond_plot_png - - fltr_mito_perc_dnst_spl_by_cond_plot_pdf - - fltr_nvlt_score_dnst_spl_by_cond_plot_png - - fltr_nvlt_score_dnst_spl_by_cond_plot_pdf - - fltr_qc_mtrcs_plot_png - - fltr_qc_mtrcs_plot_pdf - - fltr_qc_mtrcs_gr_by_cond_plot_png - - fltr_qc_mtrcs_gr_by_cond_plot_pdf - - fltr_pca_spl_by_ph_plot_png - - fltr_pca_spl_by_ph_plot_pdf - - fltr_pca_spl_by_mito_perc_plot_png - - fltr_pca_spl_by_mito_perc_plot_pdf - - fltr_umap_spl_by_idnt_plot_png - - fltr_umap_spl_by_idnt_plot_pdf - - ntgr_elbow_plot_png - - ntgr_elbow_plot_pdf - - ntgr_pca_plot_png - - ntgr_pca_plot_pdf - - ntgr_pca_heatmap_png - - ntgr_pca_heatmap_pdf - - ntgr_pca_loadings_plot_png - - ntgr_pca_loadings_plot_pdf - - ntgr_umap_spl_by_idnt_plot_png - - ntgr_umap_spl_by_idnt_plot_pdf - - clst_umap_res_plot_png - - clst_umap_res_plot_pdf - - clst_umap_spl_by_cond_res_plot_png - - clst_umap_spl_by_cond_res_plot_pdf - - clst_umap_ctype_res_plot_png - - clst_umap_ctype_res_plot_pdf - - clst_umap_spl_by_ph_res_plot_png - - clst_umap_spl_by_ph_res_plot_pdf - - clst_qc_mtrcs_res_plot_png - - clst_qc_mtrcs_res_plot_pdf - - clst_pttv_gene_markers - - clst_csrvd_gene_markers - - expr_avg_per_clst_res_plot_png - - expr_avg_per_clst_res_plot_pdf - - expr_per_clst_cell_res_plot_png - - expr_per_clst_cell_res_plot_pdf - - expr_clst_heatmap_res_plot_png - - expr_clst_heatmap_res_plot_pdf - - expr_dnst_per_clst_res_plot_png - - expr_dnst_per_clst_res_plot_pdf - - expr_avg_per_ctype_res_plot_png - - expr_avg_per_ctype_res_plot_pdf - - expr_per_ctype_cell_res_plot_png - - expr_per_ctype_cell_res_plot_pdf - - expr_ctype_heatmap_res_plot_png - - expr_ctype_heatmap_res_plot_pdf - - expr_dnst_per_ctype_res_plot_png - - expr_dnst_per_ctype_res_plot_pdf - - seurat_clst_data_rds - - cellbrowser_config_data - - cellbrowser_html_data - - cellbrowser_html_file - - 
stdout_log - - stderr_log - - compress_cellbrowser_config_data: - run: ../tools/tar-compress.cwl - in: - folder_to_compress: seurat_cluster/cellbrowser_config_data - out: - - compressed_folder - - -$namespaces: - s: http://schema.org/ - -$schemas: -- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - -s:name: "Deprecated. Seurat Cluster" -label: "Deprecated. Seurat Cluster" -s:alternateName: "Runs filtering, integration, and clustering analyses for Cell Ranger Count Gene Expression or Cell Ranger Aggregate experiments" - -s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/seurat-cluster.cwl -s:codeRepository: https://github.com/datirium/workflows -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Cincinnati Children's Hospital Medical Center" - s:location: - - class: s:PostalAddress - s:addressCountry: "USA" - s:addressLocality: "Cincinnati" - s:addressRegion: "OH" - s:postalCode: "45229" - s:streetAddress: "3333 Burnet Ave" - s:telephone: "+1(513)636-4200" - s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" - s:department: - - class: s:Organization - s:legalName: "Allergy and Immunology" - s:department: - - class: s:Organization - s:legalName: "Barski Research Lab" - s:member: - - class: s:Person - s:name: Michael Kotliar - s:email: mailto:misha.kotliar@gmail.com - s:sameAs: - - id: http://orcid.org/0000-0002-6486-3898 - - -doc: | - Deprecated. Seurat Cluster - ========================== - - Runs filtering, integration, and clustering analyses for Cell Ranger - Count Gene Expression or Cell Ranger Aggregate experiments. 
\ No newline at end of file From 99df46125258d04f270575b1d2b53e9fa406df8e Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 11 Dec 2023 14:31:22 -0500 Subject: [PATCH 093/162] Update inputs description for deseq because docker image was updated --- tools/deseq-multi-factor.cwl | 16 ++++++++-------- workflows/deseq-multi-factor.cwl | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tools/deseq-multi-factor.cwl b/tools/deseq-multi-factor.cwl index 2cb64bb9..f17c704f 100644 --- a/tools/deseq-multi-factor.cwl +++ b/tools/deseq-multi-factor.cwl @@ -71,10 +71,10 @@ inputs: Contrast to be be applied for the output, formatted as a mathematical formula of values from the --metadata table. If not provided, all possible combinations of values from - the metadata columns present in the --design but not in the - --reduced formula will be used (results will be merged giving - the priority to significantly differentially expressed genes - with higher absolute log2FoldChange values). + the metadata columns present in the --design will be used + (results will be merged giving the priority to significantly + differentially expressed genes with higher absolute + log2FoldChange values). base: type: @@ -422,10 +422,10 @@ s:about: | --contrast CONTRAST Contrast to be be applied for the output, formatted as a mathematical formula of values from the --metadata table. If not provided, all possible combinations of values from - the metadata columns present in the --design but not in the - --reduced formula will be used (results will be merged giving - the priority to significantly differentially expressed genes - with higher absolute log2FoldChange values). + the metadata columns present in the --design will be used + (results will be merged giving the priority to significantly + differentially expressed genes with higher absolute + log2FoldChange values). --base [BASE ...] Value(s) from each metadata file column(s) to be set as the base level(s). 
Number and order of provided values should correspond the order of columns in diff --git a/workflows/deseq-multi-factor.cwl b/workflows/deseq-multi-factor.cwl index 0bfb7271..dc952991 100644 --- a/workflows/deseq-multi-factor.cwl +++ b/workflows/deseq-multi-factor.cwl @@ -111,10 +111,10 @@ inputs: Contrast to be be applied for the output, formatted as a mathematical formula of values from the --metadata table. If not provided, all possible combinations of values from - the metadata columns present in the --design but not in the - --reduced formula will be used (results will be merged giving - the priority to significantly differentially expressed genes - with higher absolute log2FoldChange values). + the metadata columns present in the --design will be used + (results will be merged giving the priority to significantly + differentially expressed genes with higher absolute + log2FoldChange values). remove: type: string? From 3852db43f1e7a6e0b625e7b0158ad979e8704ae6 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 11 Dec 2023 14:53:37 -0500 Subject: [PATCH 094/162] Remove deprecated workflows from the upstreams --- workflows/bedtools-multicov.cwl | 6 ------ workflows/deseq-lrt.cwl | 6 ------ workflows/deseq-multi-factor.cwl | 4 ---- workflows/deseq.cwl | 12 ------------ workflows/diffbind-multi-factor.cwl | 2 -- workflows/diffbind.cwl | 6 ------ workflows/feature-merge.cwl | 6 ------ workflows/filter-peaks-for-heatmap.cwl | 2 -- workflows/genelists-deseq-diffbind.cwl | 8 -------- workflows/heatmap.cwl | 2 -- workflows/homer-motif-analysis-peak.cwl | 4 ---- workflows/hopach.cwl | 6 ------ workflows/manorm-pe.cwl | 2 -- workflows/manorm-se.cwl | 2 -- workflows/pca.cwl | 6 ------ workflows/rgt-thor.cwl | 4 ---- workflows/super-enhancer.cwl | 4 ---- workflows/trim-chipseq-pe-cut-n-run.cwl | 1 - 18 files changed, 83 deletions(-) diff --git a/workflows/bedtools-multicov.cwl b/workflows/bedtools-multicov.cwl index 8838c3f6..9e3aa742 100644 --- 
a/workflows/bedtools-multicov.cwl +++ b/workflows/bedtools-multicov.cwl @@ -11,16 +11,10 @@ requirements: 'sd:upstream': sample: - - "chipseq-se.cwl" - - "chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-se.cwl" - "trim-atacseq-pe.cwl" - - "rnaseq-se.cwl" - - "rnaseq-pe.cwl" - - "rnaseq-se-dutp.cwl" - - "rnaseq-pe-dutp.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" - "trim-rnaseq-pe-dutp.cwl" diff --git a/workflows/deseq-lrt.cwl b/workflows/deseq-lrt.cwl index 5311e347..58a74295 100644 --- a/workflows/deseq-lrt.cwl +++ b/workflows/deseq-lrt.cwl @@ -11,12 +11,6 @@ requirements: 'sd:upstream': rnaseq_experiment: - - "rnaseq-se.cwl" - - "rnaseq-pe.cwl" - - "rnaseq-se-dutp.cwl" - - "rnaseq-pe-dutp.cwl" - - "rnaseq-se-dutp-mitochondrial.cwl" - - "rnaseq-pe-dutp-mitochondrial.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" - "trim-rnaseq-pe-dutp.cwl" diff --git a/workflows/deseq-multi-factor.cwl b/workflows/deseq-multi-factor.cwl index dc952991..01e6395a 100644 --- a/workflows/deseq-multi-factor.cwl +++ b/workflows/deseq-multi-factor.cwl @@ -19,10 +19,6 @@ requirements: 'sd:upstream': rnaseq_experiment: - - "rnaseq-se.cwl" - - "rnaseq-pe.cwl" - - "rnaseq-se-dutp.cwl" - - "rnaseq-pe-dutp.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" - "trim-rnaseq-pe-dutp.cwl" diff --git a/workflows/deseq.cwl b/workflows/deseq.cwl index 95cd16a9..b5475866 100644 --- a/workflows/deseq.cwl +++ b/workflows/deseq.cwl @@ -12,12 +12,6 @@ requirements: 'sd:upstream': rnaseq_cond_1: - "mirna-mirdeep2-se.cwl" - - "rnaseq-se.cwl" - - "rnaseq-pe.cwl" - - "rnaseq-se-dutp.cwl" - - "rnaseq-pe-dutp.cwl" - - "rnaseq-se-dutp-mitochondrial.cwl" - - "rnaseq-pe-dutp-mitochondrial.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" - "trim-rnaseq-pe-dutp.cwl" @@ -26,12 +20,6 @@ requirements: - "trim-quantseq-mrnaseq-se-strand-specific.cwl" rnaseq_cond_2: - "mirna-mirdeep2-se.cwl" - - "rnaseq-se.cwl" - - "rnaseq-pe.cwl" - - "rnaseq-se-dutp.cwl" - - "rnaseq-pe-dutp.cwl" 
- - "rnaseq-se-dutp-mitochondrial.cwl" - - "rnaseq-pe-dutp-mitochondrial.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" - "trim-rnaseq-pe-dutp.cwl" diff --git a/workflows/diffbind-multi-factor.cwl b/workflows/diffbind-multi-factor.cwl index 44c731c2..e5d5f801 100644 --- a/workflows/diffbind-multi-factor.cwl +++ b/workflows/diffbind-multi-factor.cwl @@ -18,8 +18,6 @@ requirements: 'sd:upstream': dna_experiment: - - "chipseq-se.cwl" - - "chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-se.cwl" diff --git a/workflows/diffbind.cwl b/workflows/diffbind.cwl index 8a0ae308..08a4af38 100644 --- a/workflows/diffbind.cwl +++ b/workflows/diffbind.cwl @@ -10,8 +10,6 @@ requirements: 'sd:upstream': first_biological_condition: - - "chipseq-se.cwl" - - "chipseq-pe.cwl" - "cutandrun-macs2-pe.cwl" - "cutandrun-seacr-pe.cwl" - "trim-chipseq-se.cwl" @@ -19,8 +17,6 @@ requirements: - "trim-atacseq-se.cwl" - "trim-atacseq-pe.cwl" second_biological_condition: - - "chipseq-se.cwl" - - "chipseq-pe.cwl" - "cutandrun-macs2-pe.cwl" - "cutandrun-seacr-pe.cwl" - "trim-chipseq-se.cwl" @@ -28,8 +24,6 @@ requirements: - "trim-atacseq-se.cwl" - "trim-atacseq-pe.cwl" blocked_condition: - - "chipseq-se.cwl" - - "chipseq-pe.cwl" - "cutandrun-macs2-pe.cwl" - "cutandrun-seacr-pe.cwl" - "trim-chipseq-se.cwl" diff --git a/workflows/feature-merge.cwl b/workflows/feature-merge.cwl index 999b49c9..631fe6de 100644 --- a/workflows/feature-merge.cwl +++ b/workflows/feature-merge.cwl @@ -11,12 +11,6 @@ requirements: 'sd:upstream': rnaseq_sample: - - "rnaseq-se.cwl" - - "rnaseq-pe.cwl" - - "rnaseq-se-dutp.cwl" - - "rnaseq-pe-dutp.cwl" - - "rnaseq-se-dutp-mitochondrial.cwl" - - "rnaseq-pe-dutp-mitochondrial.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" - "trim-rnaseq-pe-dutp.cwl" diff --git a/workflows/filter-peaks-for-heatmap.cwl b/workflows/filter-peaks-for-heatmap.cwl index 96c4993c..1783e0e2 100644 --- a/workflows/filter-peaks-for-heatmap.cwl +++ 
b/workflows/filter-peaks-for-heatmap.cwl @@ -11,8 +11,6 @@ requirements: 'sd:upstream': sample_to_filter: - - "chipseq-se.cwl" - - "chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-se.cwl" diff --git a/workflows/genelists-deseq-diffbind.cwl b/workflows/genelists-deseq-diffbind.cwl index 40a17a8f..01491efa 100644 --- a/workflows/genelists-deseq-diffbind.cwl +++ b/workflows/genelists-deseq-diffbind.cwl @@ -14,8 +14,6 @@ requirements: - "filter-deseq-for-heatmap.cwl" - "filter-diffbind-for-heatmap.cwl" samples_nabinding: - - "chipseq-se.cwl" - - "chipseq-pe.cwl" - "cutandrun-macs2-pe.cwl" - "cutandrun-seacr-pe.cwl" - "trim-chipseq-se.cwl" @@ -24,12 +22,6 @@ requirements: - "trim-atacseq-pe.cwl" samples_rnaseq: - "mirna-mirdeep2-se.cwl" - - "rnaseq-se.cwl" - - "rnaseq-pe.cwl" - - "rnaseq-se-dutp.cwl" - - "rnaseq-pe-dutp.cwl" - - "rnaseq-se-dutp-mitochondrial.cwl" - - "rnaseq-pe-dutp-mitochondrial.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" - "trim-rnaseq-pe-dutp.cwl" diff --git a/workflows/heatmap.cwl b/workflows/heatmap.cwl index 2a1b11d6..f0a15aa9 100644 --- a/workflows/heatmap.cwl +++ b/workflows/heatmap.cwl @@ -14,8 +14,6 @@ requirements: 'sd:upstream': chipseq_sample: - - "chipseq-se.cwl" - - "chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-se.cwl" diff --git a/workflows/homer-motif-analysis-peak.cwl b/workflows/homer-motif-analysis-peak.cwl index 93fddde9..6847f944 100644 --- a/workflows/homer-motif-analysis-peak.cwl +++ b/workflows/homer-motif-analysis-peak.cwl @@ -12,15 +12,11 @@ requirements: genome_indices: - "genome-indices.cwl" regions_a: - - "chipseq-se.cwl" - - "chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-se.cwl" - "trim-atacseq-pe.cwl" regions_b: - - "chipseq-se.cwl" - - "chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-se.cwl" diff --git a/workflows/hopach.cwl b/workflows/hopach.cwl index db294ccd..6456446c 100644 --- 
a/workflows/hopach.cwl +++ b/workflows/hopach.cwl @@ -11,12 +11,6 @@ requirements: 'sd:upstream': rnaseq_sample: - - "rnaseq-se.cwl" - - "rnaseq-pe.cwl" - - "rnaseq-se-dutp.cwl" - - "rnaseq-pe-dutp.cwl" - - "rnaseq-se-dutp-mitochondrial.cwl" - - "rnaseq-pe-dutp-mitochondrial.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" - "trim-rnaseq-pe-dutp.cwl" diff --git a/workflows/manorm-pe.cwl b/workflows/manorm-pe.cwl index 9e968387..1b008f8b 100644 --- a/workflows/manorm-pe.cwl +++ b/workflows/manorm-pe.cwl @@ -10,13 +10,11 @@ requirements: 'sd:upstream': first_chipseq_sample: - - "chipseq-pe.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-pe.cwl" - "cutandrun-macs2-pe.cwl" - "cutandrun-seacr-pe.cwl" second_chipseq_sample: - - "chipseq-pe.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-pe.cwl" - "cutandrun-macs2-pe.cwl" diff --git a/workflows/manorm-se.cwl b/workflows/manorm-se.cwl index 79c7bf56..9cbc0336 100644 --- a/workflows/manorm-se.cwl +++ b/workflows/manorm-se.cwl @@ -10,11 +10,9 @@ requirements: 'sd:upstream': first_chipseq_sample: - - "chipseq-se.cwl" - "trim-chipseq-se.cwl" - "trim-atacseq-se.cwl" second_chipseq_sample: - - "chipseq-se.cwl" - "trim-chipseq-se.cwl" - "trim-atacseq-se.cwl" diff --git a/workflows/pca.cwl b/workflows/pca.cwl index 006c1372..f9d46029 100644 --- a/workflows/pca.cwl +++ b/workflows/pca.cwl @@ -11,12 +11,6 @@ requirements: 'sd:upstream': rnaseq_sample: - - "rnaseq-se.cwl" - - "rnaseq-pe.cwl" - - "rnaseq-se-dutp.cwl" - - "rnaseq-pe-dutp.cwl" - - "rnaseq-se-dutp-mitochondrial.cwl" - - "rnaseq-pe-dutp-mitochondrial.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" - "trim-rnaseq-pe-dutp.cwl" diff --git a/workflows/rgt-thor.cwl b/workflows/rgt-thor.cwl index 04f607ec..7bcf6f04 100644 --- a/workflows/rgt-thor.cwl +++ b/workflows/rgt-thor.cwl @@ -10,15 +10,11 @@ requirements: 'sd:upstream': first_biological_condition: - - "chipseq-se.cwl" - - "chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-se.cwl" - 
"trim-atacseq-pe.cwl" second_biological_condition: - - "chipseq-se.cwl" - - "chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-se.cwl" diff --git a/workflows/super-enhancer.cwl b/workflows/super-enhancer.cwl index 07dbadf1..16c3f4d1 100644 --- a/workflows/super-enhancer.cwl +++ b/workflows/super-enhancer.cwl @@ -10,14 +10,10 @@ requirements: 'sd:upstream': chipseq_sample: - - "chipseq-se.cwl" - - "chipseq-pe.cwl" - "trim-chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "cutandrun-pe.cwl" chipseq_control: - - "chipseq-se.cwl" - - "chipseq-pe.cwl" - "trim-chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "cutandrun-pe.cwl" diff --git a/workflows/trim-chipseq-pe-cut-n-run.cwl b/workflows/trim-chipseq-pe-cut-n-run.cwl index 04908f44..a80fdac6 100644 --- a/workflows/trim-chipseq-pe-cut-n-run.cwl +++ b/workflows/trim-chipseq-pe-cut-n-run.cwl @@ -380,7 +380,6 @@ s:creator: doc: | Experimental pipeline for Cut-n-Run analysis. Uses mapping results from the following experiment types: - - `chipseq-pe.cwl` - `trim-chipseq-pe.cwl` - `trim-atacseq-pe.cwl` From 9bdb36c4ba4ce0f352f120bb33da5ccc5378e63d Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Sat, 23 Dec 2023 13:25:14 -0500 Subject: [PATCH 095/162] Update threads options to ARC pipelines --- workflows/cellranger-arc-aggr.cwl | 2 ++ workflows/cellranger-arc-count.cwl | 2 ++ 2 files changed, 4 insertions(+) diff --git a/workflows/cellranger-arc-aggr.cwl b/workflows/cellranger-arc-aggr.cwl index d34727a8..8fe650d6 100644 --- a/workflows/cellranger-arc-aggr.cwl +++ b/workflows/cellranger-arc-aggr.cwl @@ -94,6 +94,8 @@ inputs: - "2" - "3" - "4" + - "5" + - "6" default: "4" label: "Cores/CPUs" doc: | diff --git a/workflows/cellranger-arc-count.cwl b/workflows/cellranger-arc-count.cwl index 7ec8ef24..d91590e0 100644 --- a/workflows/cellranger-arc-count.cwl +++ b/workflows/cellranger-arc-count.cwl @@ -128,6 +128,8 @@ inputs: - "2" - "3" - "4" + - "5" + - "6" default: "4" label: "Cores/CPUs" doc: | From 
635a13c78354caf376b247a48bd2e106e8d3d138 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 27 Dec 2023 16:19:33 -0500 Subject: [PATCH 096/162] Remove multiple format fields from inputs --- workflows/trim-quantseq-mrnaseq-se-strand-specific.cwl | 4 +--- workflows/trim-quantseq-mrnaseq-se.cwl | 4 +--- workflows/trim-rnaseq-pe-dutp.cwl | 4 +--- workflows/trim-rnaseq-pe-smarter-dutp.cwl | 4 +--- workflows/trim-rnaseq-pe.cwl | 4 +--- workflows/trim-rnaseq-se-dutp.cwl | 4 +--- workflows/trim-rnaseq-se.cwl | 4 +--- 7 files changed, 7 insertions(+), 21 deletions(-) diff --git a/workflows/trim-quantseq-mrnaseq-se-strand-specific.cwl b/workflows/trim-quantseq-mrnaseq-se-strand-specific.cwl index 8a48978c..c8dd8f5d 100644 --- a/workflows/trim-quantseq-mrnaseq-se-strand-specific.cwl +++ b/workflows/trim-quantseq-mrnaseq-se-strand-specific.cwl @@ -44,9 +44,7 @@ inputs: annotation_file: type: File label: "Annotation file" - format: - - "http://edamontology.org/format_2306" - - "http://edamontology.org/format_3475" + format: "http://edamontology.org/format_3475" 'sd:upstreamSource': "genome_indices/annotation" doc: "GTF or TAB-separated annotation file" diff --git a/workflows/trim-quantseq-mrnaseq-se.cwl b/workflows/trim-quantseq-mrnaseq-se.cwl index 54d763dd..7d111ad9 100644 --- a/workflows/trim-quantseq-mrnaseq-se.cwl +++ b/workflows/trim-quantseq-mrnaseq-se.cwl @@ -45,9 +45,7 @@ inputs: annotation_file: type: File label: "Annotation file" - format: - - "http://edamontology.org/format_2306" - - "http://edamontology.org/format_3475" + format: "http://edamontology.org/format_3475" 'sd:upstreamSource': "genome_indices/annotation" doc: "GTF or TAB-separated annotation file" diff --git a/workflows/trim-rnaseq-pe-dutp.cwl b/workflows/trim-rnaseq-pe-dutp.cwl index 03a27b0d..d8ef358a 100644 --- a/workflows/trim-rnaseq-pe-dutp.cwl +++ b/workflows/trim-rnaseq-pe-dutp.cwl @@ -46,9 +46,7 @@ inputs: annotation_file: type: File label: "Annotation file" - format: - - 
"http://edamontology.org/format_2306" - - "http://edamontology.org/format_3475" + format: "http://edamontology.org/format_3475" 'sd:upstreamSource': "genome_indices/annotation" doc: "GTF or TAB-separated annotation file" diff --git a/workflows/trim-rnaseq-pe-smarter-dutp.cwl b/workflows/trim-rnaseq-pe-smarter-dutp.cwl index 36baa3c5..db952827 100644 --- a/workflows/trim-rnaseq-pe-smarter-dutp.cwl +++ b/workflows/trim-rnaseq-pe-smarter-dutp.cwl @@ -46,9 +46,7 @@ inputs: annotation_file: type: File label: "Annotation file" - format: - - "http://edamontology.org/format_2306" - - "http://edamontology.org/format_3475" + format: "http://edamontology.org/format_3475" 'sd:upstreamSource': "genome_indices/annotation" doc: "GTF or TAB-separated annotation file" diff --git a/workflows/trim-rnaseq-pe.cwl b/workflows/trim-rnaseq-pe.cwl index 0fd9afea..4ddedfa0 100644 --- a/workflows/trim-rnaseq-pe.cwl +++ b/workflows/trim-rnaseq-pe.cwl @@ -45,9 +45,7 @@ inputs: annotation_file: type: File label: "Annotation file" - format: - - "http://edamontology.org/format_2306" - - "http://edamontology.org/format_3475" + format: "http://edamontology.org/format_3475" 'sd:upstreamSource': "genome_indices/annotation" doc: "GTF or TAB-separated annotation file" diff --git a/workflows/trim-rnaseq-se-dutp.cwl b/workflows/trim-rnaseq-se-dutp.cwl index c77e3824..d32a8556 100644 --- a/workflows/trim-rnaseq-se-dutp.cwl +++ b/workflows/trim-rnaseq-se-dutp.cwl @@ -44,9 +44,7 @@ inputs: annotation_file: type: File label: "Annotation file" - format: - - "http://edamontology.org/format_2306" - - "http://edamontology.org/format_3475" + format: "http://edamontology.org/format_3475" 'sd:upstreamSource': "genome_indices/annotation" doc: "GTF or TAB-separated annotation file" diff --git a/workflows/trim-rnaseq-se.cwl b/workflows/trim-rnaseq-se.cwl index d4054f93..d2de7864 100644 --- a/workflows/trim-rnaseq-se.cwl +++ b/workflows/trim-rnaseq-se.cwl @@ -45,9 +45,7 @@ inputs: annotation_file: type: File label: 
"Annotation file" - format: - - "http://edamontology.org/format_2306" - - "http://edamontology.org/format_3475" + format: "http://edamontology.org/format_3475" 'sd:upstreamSource': "genome_indices/annotation" doc: "GTF or TAB-separated annotation file" From 72039b9b974a6e8d4c7de0841829392e711c6e87 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 4 Jan 2024 12:24:18 -0500 Subject: [PATCH 097/162] Add example of the datasets grouping to sc aggr pipelines --- tools/cellranger-aggr.cwl | 28 +++++++++++++++++++++++----- tools/cellranger-arc-aggr.cwl | 23 +++++++++++++++++++---- workflows/cellranger-aggr.cwl | 7 +++++++ workflows/cellranger-arc-aggr.cwl | 11 ++++++++++- 4 files changed, 59 insertions(+), 10 deletions(-) diff --git a/tools/cellranger-aggr.cwl b/tools/cellranger-aggr.cwl index e665ec31..32a8153a 100644 --- a/tools/cellranger-aggr.cwl +++ b/tools/cellranger-aggr.cwl @@ -13,10 +13,12 @@ requirements: - class: InitialWorkDirRequirement listing: | ${ + var grouping = "library_id\tcondition\n" if (inputs.molecule_info_h5 != null){ var entry = "sample_id,molecule_h5\n" for (var i=0; i < inputs.molecule_info_h5.length; i++){ entry += get_label(inputs.molecule_info_h5, i) + "," + inputs.molecule_info_h5[i].path + "\n" + grouping += get_label(inputs.molecule_info_h5, i) + "\t" + get_label(inputs.molecule_info_h5, i) + "\n" } } else if (inputs.filtered_data_folder != null){ var entry = "sample_id,sample_outs,donor,origin\n" @@ -30,15 +32,24 @@ requirements: origin = "origin_" + i } entry += get_label(inputs.filtered_data_folder, i) + "," + inputs.filtered_data_folder[i].path + "," + donor + "," + origin + "\n" + grouping += get_label(inputs.filtered_data_folder, i) + "\t" + get_label(inputs.filtered_data_folder, i) + "\n" } } else { var entry = "neither molecule_info_h5 nor filtered_data_folder was provided" + var grouping = "neither molecule_info_h5 nor filtered_data_folder was provided" } - return [{ - "entry": entry, - "entryname": "metadata.csv", - 
"writable": true - }]; + return [ + { + "entry": entry, + "entryname": "metadata.csv", + "writable": true + }, + { + "entry": grouping, + "entryname": "grouping.tsv", + "writable": true + } + ]; } @@ -198,6 +209,13 @@ outputs: doc: | Copy of the input aggregation CSV file + grouping_data: + type: File + outputBinding: + glob: "grouping.tsv" + doc: | + Example of TSV file to define datasets grouping + loupe_browser_track: type: File outputBinding: diff --git a/tools/cellranger-arc-aggr.cwl b/tools/cellranger-arc-aggr.cwl index f0aa87d8..608d57aa 100644 --- a/tools/cellranger-arc-aggr.cwl +++ b/tools/cellranger-arc-aggr.cwl @@ -14,13 +14,21 @@ requirements: listing: | ${ var entry = "library_id,atac_fragments,per_barcode_metrics,gex_molecule_info\n" + var grouping = "library_id\tcondition\n" for (var i=0; i < inputs.gex_molecule_info_h5.length; i++){ entry += get_label(i) + "," + inputs.atac_fragments_file_from_count[i].path + "," + inputs.barcode_metrics_report[i].path + "," + inputs.gex_molecule_info_h5[i].path + "\n" + grouping += get_label(i) + "\t" + get_label(i) + "\n" } - return [{ - "entry": entry, - "entryname": "metadata.csv" - }]; + return [ + { + "entry": entry, + "entryname": "metadata.csv" + }, + { + "entry": grouping, + "entryname": "grouping.tsv" + } + ]; } @@ -202,6 +210,13 @@ outputs: doc: | Copy of the input aggregation CSV file + grouping_data: + type: File + outputBinding: + glob: "grouping.tsv" + doc: | + Example of TSV file to define datasets grouping + loupe_browser_track: type: File outputBinding: diff --git a/workflows/cellranger-aggr.cwl b/workflows/cellranger-aggr.cwl index a4b6aba6..0e343304 100644 --- a/workflows/cellranger-aggr.cwl +++ b/workflows/cellranger-aggr.cwl @@ -144,6 +144,12 @@ outputs: label: "Aggregation metadata in CSV format" doc: "Aggregation metadata in CSV format" + grouping_data: + type: File + outputSource: aggregate_counts/grouping_data + label: "Example of datasets grouping" + doc: "Example of TSV file to define 
datasets grouping" + loupe_browser_track: type: File outputSource: aggregate_counts/loupe_browser_track @@ -239,6 +245,7 @@ steps: - filtered_feature_bc_matrix_folder - filtered_feature_bc_matrix_h5 - aggregation_metadata + - grouping_data - loupe_browser_track - clonotypes_csv - consensus_sequences_fasta diff --git a/workflows/cellranger-arc-aggr.cwl b/workflows/cellranger-arc-aggr.cwl index 8fe650d6..3c969b45 100644 --- a/workflows/cellranger-arc-aggr.cwl +++ b/workflows/cellranger-arc-aggr.cwl @@ -59,7 +59,8 @@ inputs: 'sd:localLabel': true memory_limit: - type: int + type: int? + default: 20 'sd:upstreamSource': "genome_indices/memory_limit" normalization_mode: @@ -147,6 +148,13 @@ outputs: Aggregation metadata file in CSV format + grouping_data: + type: File + outputSource: aggregate_counts/grouping_data + label: "Example of datasets grouping" + doc: | + Example of TSV file to define datasets grouping + filtered_feature_bc_matrix_folder: type: File outputSource: compress_filtered_feature_bc_matrix_folder/compressed_folder @@ -304,6 +312,7 @@ steps: - raw_feature_bc_matrices_folder - raw_feature_bc_matrices_h5 - aggregation_metadata + - grouping_data - loupe_browser_track - stdout_log - stderr_log From 1aeff4551fc79c524cb4f15cefdd8631f8ebb27c Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 10 Jan 2024 14:37:54 -0500 Subject: [PATCH 098/162] Update dimensions to be an integer, fix bug in sc ATAC Dim Reduc --- tools/sc-atac-cluster.cwl | 13 +++++-------- tools/sc-atac-coverage.cwl | 2 +- tools/sc-atac-dbinding.cwl | 2 +- tools/sc-atac-reduce.cwl | 23 ++++++++--------------- tools/sc-ctype-assign.cwl | 2 +- tools/sc-multiome-filter.cwl | 2 +- tools/sc-rna-cluster.cwl | 16 +++++----------- tools/sc-rna-da-cells.cwl | 11 +++-------- tools/sc-rna-de-pseudobulk.cwl | 2 +- tools/sc-rna-filter.cwl | 2 +- tools/sc-rna-reduce.cwl | 18 ++++++------------ tools/sc-rna-trajectory.cwl | 12 +++--------- tools/sc-triangulate.cwl | 2 +- tools/sc-vdj-profile.cwl | 2 +- 
tools/sc-wnn-cluster.cwl | 26 +++++++++----------------- workflows/sc-atac-reduce.cwl | 4 +--- 16 files changed, 48 insertions(+), 91 deletions(-) diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index e9e75a8f..a08204f0 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.31 + dockerPull: biowardrobe2/sc-tools:v0.0.32 inputs: @@ -26,17 +26,14 @@ inputs: 'atac_lsi' and 'atacumap' dimensionality reductions applied to that assay. dimensions: - type: - - "null" - - int - - int[] + type: int? inputBinding: prefix: "--dimensions" doc: | Dimensionality to use when constructing nearest-neighbor graph before clustering - (from 1 to 50). If single value N is provided, use from 2 to N dimensions. If - multiple values are provided, subset to only selected dimensions. - Default: from 2 to 10 + (from 2 to 50). First LSI component is always excluded unless the provided RDS + file consists of multiple datasets integrated with Harmony. 
+ Default: 10 cluster_metric: type: diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index f6d313a0..0c91c2e4 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.31 + dockerPull: biowardrobe2/sc-tools:v0.0.32 inputs: diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index bb946fff..30ee25d6 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.31 + dockerPull: biowardrobe2/sc-tools:v0.0.32 inputs: diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index 4ad66409..b0304043 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.31 + dockerPull: biowardrobe2/sc-tools:v0.0.32 inputs: @@ -84,8 +84,8 @@ inputs: prefix: "--ntgr" doc: | Integration method used for joint analysis of multiple - datasets. Automatically set to 'none' if loaded Suerat - object includes only one dataset. Default: signac + datasets. + Default: signac integrate_by: type: @@ -115,21 +115,14 @@ inputs: Default: 0 (use all available peaks) dimensions: - type: - - "null" - - int - - int[] + type: int? inputBinding: prefix: "--dimensions" doc: | - Dimensionality to use for datasets integration and - UMAP projection (from 2 to 50). If single value N is - provided, use from 2 to N LSI components. If multiple - values are provided, subset to only selected LSI - components. In combination with --ntgr set to harmony, - multiple values will result in using all dimensions - starting from 1(!) to the max of the provided values. 
- Default: from 2 to 10 + Dimensionality to use for datasets integration (if provided RDS file includes + multiple datasets and --ntgr is not set to 'none') and UMAP projection. + (from 2 to 50). First LSI component is always excluded. + Default: 10 umap_spread: type: float? diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index 3bff1746..0f0dc3a2 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.31 + dockerPull: biowardrobe2/sc-tools:v0.0.32 inputs: diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index 37c5f837..0954ae5f 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.31 + dockerPull: biowardrobe2/sc-tools:v0.0.32 inputs: diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index bcbecf55..4ac3d21e 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.31 + dockerPull: biowardrobe2/sc-tools:v0.0.32 inputs: @@ -26,19 +26,13 @@ inputs: dimensionality reductions applied to that assay. dimensions: - type: - - "null" - - int - - int[] + type: int? inputBinding: prefix: "--dimensions" doc: | - Dimensionality to use when constructing nearest- - neighbor graph before clustering (from 1 to 50). If - single value N is provided, use from 1 to N - dimensions. If multiple values are provided, subset to - only selected dimensions. - Default: from 1 to 10 + Dimensionality to use when constructing nearest-neighbor + graph before clustering (from 1 to 50). 
+ Default: 10 cluster_metric: type: diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index 7814b19e..2a1f0f18 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.31 + dockerPull: biowardrobe2/sc-tools:v0.0.32 inputs: @@ -35,17 +35,12 @@ inputs: Default: pca dimensions: - type: - - "null" - - int - - int[] + type: int? inputBinding: prefix: "--dimensions" doc: | Dimensionality to use when running DA analysis (from 1 to 50). - If single value N is provided, use from 1 to N PCs. If multiple - values are provided, subset to only selected PCs. - Default: from 1 to 10 + Default: 10 score_vector_knn: type: diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index fd50cb8e..997996d6 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.31 + dockerPull: biowardrobe2/sc-tools:v0.0.32 inputs: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index 73fa7009..1d883441 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.31 + dockerPull: biowardrobe2/sc-tools:v0.0.32 inputs: diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index 213b0a5d..745635d4 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.31 + dockerPull: biowardrobe2/sc-tools:v0.0.32 inputs: @@ -177,20 +177,14 @@ inputs: Default: false dimensions: - type: - - "null" - - int - - int[] + type: int? inputBinding: prefix: "--dimensions" doc: | - Dimensionality to use in UMAP projection (from 1 to - 50). If single value N is provided, use from 1 to N - PCs. 
If multiple values are provided, subset to only - specified PCs. In combination with --ntgr set to - harmony, multiple values will result in using all - principal components starting from 1 to the max of the - provided values. Default: from 1 to 10 + Dimensionality to use for datasets integration (if provided RDS + file includes multiple datasets and --ntgr is not set to 'harmony') + and UMAP projection (from 1 to 50). + Default: 10 umap_spread: type: float? diff --git a/tools/sc-rna-trajectory.cwl b/tools/sc-rna-trajectory.cwl index c359e3e0..540a5efc 100644 --- a/tools/sc-rna-trajectory.cwl +++ b/tools/sc-rna-trajectory.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.31 + dockerPull: biowardrobe2/sc-tools:v0.0.32 inputs: @@ -34,17 +34,11 @@ inputs: Default: pca dimensions: - type: - - "null" - - int - - int[] + type: int? inputBinding: prefix: "--dimensions" doc: | - Dimensionality to use (from 1 to 50). If single value N is provided, - use from 1 to N dimensions. If multiple values are provided, subset - to only selected dimensions. May fail if user specified more dimensions - than it was available in the selected --reduction. + Dimensionality to use (from 1 to 50). 
Default: use all available dimensions query_source_column: diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index b9debd5c..debe1e31 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.31 + dockerPull: biowardrobe2/sc-tools:v0.0.32 inputs: diff --git a/tools/sc-vdj-profile.cwl b/tools/sc-vdj-profile.cwl index 74bf9a15..116e1d2c 100644 --- a/tools/sc-vdj-profile.cwl +++ b/tools/sc-vdj-profile.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.31 + dockerPull: biowardrobe2/sc-tools:v0.0.32 inputs: diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index 74190dac..59dddfa7 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.31 + dockerPull: biowardrobe2/sc-tools:v0.0.32 inputs: @@ -27,32 +27,24 @@ inputs: and 'atacumap' dimensionality reductions should be present. rna_dimensions: - type: - - "null" - - int - - int[] + type: int? inputBinding: prefix: "--rnadimensions" doc: | Dimensionality from the 'pca' reduction to use when constructing weighted - nearest-neighbor graph before clustering (from 1 to 50). If single value N - is provided, use from 1 to N dimensions. If multiple values are provided, - subset to only selected dimensions. - Default: from 1 to 10 + nearest-neighbor graph before clustering (from 1 to 50). + Default: 10 atac_dimensions: - type: - - "null" - - int - - int[] + type: int? inputBinding: prefix: "--atacdimensions" doc: | Dimensionality from the 'atac_lsi' reduction to use when constructing weighted - nearest-neighbor graph before clustering (from 1 to 50). If single value N - is provided, use from 2 to N dimensions. If multiple values are provided, - subset to only selected dimensions. 
- Default: from 2 to 10 + nearest-neighbor graph before clustering (from 2 to 50). First LSI component is + always excluded unless the provided RDS file consists of multiple datasets + where ATAC assay were integrated with Harmony. + Default: 10 cluster_algorithm: type: diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index 16393864..8ed66b67 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -115,9 +115,7 @@ inputs: doc: | Number of dimensions to be used in LSI, datasets integration, and UMAP projection. - Accepted values range from 2 to 50. First - dimension is always excluded, unless - "Integration method" is set to "harmony". + Accepted values range from 2 to 50. Default: 40 datasets_metadata: From 80284386e8aecf7dadbb8437baf8380b90f5c5f3 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 11 Jan 2024 23:43:53 -0500 Subject: [PATCH 099/162] Update sc ctype assign pipeline to have --splitby parameter Docker is rebuilt with the same tag --- tools/sc-atac-cluster.cwl | 62 ++++++++++++++++++------- tools/sc-atac-reduce.cwl | 56 ++++++++++------------ tools/sc-ctype-assign.cwl | 44 +++++++++++++++++- tools/sc-rna-cluster.cwl | 35 +++++++------- tools/sc-rna-da-cells.cwl | 26 ++++++----- tools/sc-rna-reduce.cwl | 53 +++++++++++---------- tools/sc-rna-trajectory.cwl | 19 ++++---- tools/sc-wnn-cluster.cwl | 87 ++++++++++++++++++++++++----------- workflows/sc-ctype-assign.cwl | 28 +++++++++++ 9 files changed, 270 insertions(+), 140 deletions(-) diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index a08204f0..8b8d3c5e 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -103,6 +103,24 @@ inputs: Ignored if '--fragments' is not provided. Default: None + cvrg_upstream_bp: + type: int? + inputBinding: + prefix: "--upstream" + doc: | + Number of bases to extend the genome coverage region for + a specific gene upstream. 
Ignored if --genes or --fragments + parameters are not provided. Default: 2500 + + cvrg_downstream_bp: + type: int? + inputBinding: + prefix: "--downstream" + doc: | + Number of bases to extend the genome coverage region for + a specific gene downstream. Ignored if --genes or --fragments + parameters are not provided. Default: 2500 + identify_diff_peaks: type: boolean? inputBinding: @@ -582,17 +600,22 @@ doc: | s:about: | - usage: sc_atac_cluster.R - [-h] --query QUERY [--dimensions [DIMENSIONS [DIMENSIONS ...]]] - [--ametric {euclidean,cosine,manhattan,hamming}] - [--algorithm {louvain,mult-louvain,slm,leiden}] - [--resolution [RESOLUTION [RESOLUTION ...]]] [--fragments FRAGMENTS] - [--genes [GENES [GENES ...]]] [--diffpeaks] [--logfc LOGFC] - [--minpct MINPCT] - [--testuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] - [--pdf] [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] - [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] - [--cpus CPUS] [--memory MEMORY] + usage: sc_atac_cluster.R [-h] --query QUERY + [--dimensions DIMENSIONS] + [--ametric {euclidean,cosine,manhattan,hamming}] + [--algorithm {louvain,mult-louvain,slm,leiden}] + [--resolution [RESOLUTION [RESOLUTION ...]]] + [--fragments FRAGMENTS] + [--genes [GENES [GENES ...]]] + [--upstream UPSTREAM] + [--downstream DOWNSTREAM] + [--diffpeaks] [--logfc LOGFC] + [--minpct MINPCT] + [--testuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] + [--pdf] [--verbose] [--h5seurat] + [--h5ad] [--cbbuild] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] Single-cell ATAC-Seq Cluster Analysis @@ -603,12 +626,12 @@ s:about: | information stored in the ATAC assay, as well as 'atac_lsi' and 'atacumap' dimensionality reductions applied to that assay. 
- --dimensions [DIMENSIONS [DIMENSIONS ...]] + --dimensions DIMENSIONS Dimensionality to use when constructing nearest- - neighbor graph before clustering (from 1 to 50). If - single value N is provided, use from 2 to N - dimensions. If multiple values are provided, subset to - only selected dimensions. Default: from 2 to 10 + neighbor graph before clustering (from 2 to 50). First + LSI component is always excluded unless the provided + RDS file consists of multiple datasets integrated with + Harmony. Default: 10 --ametric {euclidean,cosine,manhattan,hamming} Distance metric used when constructing nearest- neighbor graph before clustering. Default: euclidean @@ -633,6 +656,13 @@ s:about: | it will be additionally shown on the right side of the plots. Ignored if '--fragments' is not provided. Default: None + --upstream UPSTREAM Number of bases to extend the genome coverage region + for a specific gene upstream. Ignored if --genes or + --fragments parameters are not provided. Default: 2500 + --downstream DOWNSTREAM + Number of bases to extend the genome coverage region + for a specific gene downstream. Ignored if --genes or + --fragments parameters are not provided. Default: 2500 --diffpeaks Identify differentially accessible peaks between each pair of clusters for all resolutions. 
Default: false --logfc LOGFC For differentially accessible peaks identification diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index b0304043..2098bd11 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -705,22 +705,23 @@ doc: | s:about: | - usage: sc_atac_reduce.R - [-h] --query QUERY [--metadata METADATA] [--barcodes BARCODES] - [--norm {log-tfidf,tf-logidf,logtf-logidf,idf}] - [--ntgr {signac,harmony,none}] [--ntgrby [NTGRBY [NTGRBY ...]]] - [--minvarpeaks MINVARPEAKS] - [--dimensions [DIMENSIONS [DIMENSIONS ...]]] [--uspread USPREAD] - [--umindist UMINDIST] [--uneighbors UNEIGHBORS] - [--umetric {euclidean,manhattan,chebyshev,minkowski,canberra, - braycurtis,mahalanobis,wminkowski,seuclidean,cosine, - correlation,haversine,hamming,jaccard,dice,russelrao, - kulsinski,ll_dirichlet,hellinger,rogerstanimoto, - sokalmichener,sokalsneath,yule}] - [--umethod {uwot,uwot-learn,umap-learn}] [--pdf] [--verbose] - [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] - [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] - [--cpus CPUS] [--memory MEMORY] + usage: sc_atac_reduce.R [-h] --query QUERY + [--metadata METADATA] + [--barcodes BARCODES] + [--norm {log-tfidf,tf-logidf,logtf-logidf,idf}] + [--ntgr {signac,harmony,none}] + [--ntgrby [NTGRBY [NTGRBY ...]]] + [--minvarpeaks MINVARPEAKS] + [--dimensions DIMENSIONS] + [--uspread USPREAD] + [--umindist UMINDIST] + [--uneighbors UNEIGHBORS] + [--umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis,mahalanobis,wminkowski,seuclidean,cosine,correlation,haversine,hamming,jaccard,dice,russelrao,kulsinski,ll_dirichlet,hellinger,rogerstanimoto,sokalmichener,sokalsneath,yule}] + [--umethod {uwot,uwot-learn,umap-learn}] + [--pdf] [--verbose] [--h5seurat] + [--h5ad] [--cbbuild] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] Single-cell ATAC-Seq Dimensionality Reduction Analysis @@ -754,8 +755,7 @@ s:about: 
| Default: log-tfidf --ntgr {signac,harmony,none} Integration method used for joint analysis of multiple - datasets. Automatically set to 'none' if loaded Suerat - object includes only one dataset. Default: signac + datasets. Default: signac --ntgrby [NTGRBY [NTGRBY ...]] Column(s) from the Seurat object metadata to define the variable(s) that should be integrated out when @@ -770,15 +770,12 @@ s:about: | cells peaks as highly variable. These peaks are used for datasets integration, scaling and dimensionality reduction. Default: 0 (use all available peaks) - --dimensions [DIMENSIONS [DIMENSIONS ...]] - Dimensionality to use for datasets integration and - UMAP projection (from 2 to 50). If single value N is - provided, use from 2 to N LSI components. If multiple - values are provided, subset to only selected LSI - components. In combination with --ntgr set to harmony, - multiple values will result in using all dimensions - starting from 1(!) to the max of the provided values. - Default: from 2 to 10 + --dimensions DIMENSIONS + Dimensionality to use for datasets integration (if + provided RDS file includes multiple datasets and + --ntgr is not set to 'none') and UMAP projection. + (from 2 to 50). First LSI component is always + excluded. Default: 10 --uspread USPREAD The effective scale of embedded points on UMAP. In combination with '--mindist' it determines how clustered/clumped the embedded points are. Default: 1 @@ -794,10 +791,7 @@ s:about: | structure being preserved at the loss of detailed local structure. In general this parameter should often be in the range 5 to 50. 
Default: 30 - --umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis, - mahalanobis,wminkowski,seuclidean,cosine,correlation,haversine, - hamming,jaccard,dice,russelrao,kulsinski,ll_dirichlet,hellinger, - rogerstanimoto,sokalmichener,sokalsneath,yule} + --umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis,mahalanobis,wminkowski,seuclidean,cosine,correlation,haversine,hamming,jaccard,dice,russelrao,kulsinski,ll_dirichlet,hellinger,rogerstanimoto,sokalmichener,sokalsneath,yule} The metric to use to compute distances in high dimensional space for UMAP. Default: cosine --umethod {uwot,uwot-learn,umap-learn} diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index 0f0dc3a2..03cf11bc 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -50,6 +50,15 @@ inputs: assigned cell types. Should start with 'custom_', otherwise, it won't be shown in UCSC Cell Browser. + query_splitby_column: + type: string? + inputBinding: + prefix: "--splitby" + doc: | + Column from the Seurat object metadata to additionally split + every cluster selected with --source into smaller groups. + Default: do not split + identify_diff_genes: type: boolean? inputBinding: @@ -190,6 +199,24 @@ inputs: file should be provided. Default: None + cvrg_upstream_bp: + type: int? + inputBinding: + prefix: "--upstream" + doc: | + Number of bases to extend the genome coverage region for + a specific gene upstream. Ignored if --genes or --fragments + parameters are not provided. Default: 2500 + + cvrg_downstream_bp: + type: int? + inputBinding: + prefix: "--downstream" + doc: | + Number of bases to extend the genome coverage region for + a specific gene downstream. Ignored if --genes or --fragments + parameters are not provided. Default: 2500 + export_pdf_plots: type: boolean? 
inputBinding: @@ -981,7 +1008,8 @@ doc: | s:about: | usage: sc_ctype_assign.R [-h] --query QUERY --celltypes CELLTYPES --source SOURCE --target - TARGET [--diffgenes] [--diffpeaks] + TARGET [--splitby SPLITBY] + [--diffgenes] [--diffpeaks] [--rnalogfc RNALOGFC] [--rnaminpct RNAMINPCT] [--rnaonlypos] [--rnatestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] @@ -989,7 +1017,9 @@ s:about: | [--atacminpct ATACMINPCT] [--atactestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] [--fragments FRAGMENTS] - [--genes [GENES [GENES ...]]] [--pdf] + [--genes [GENES [GENES ...]]] + [--upstream UPSTREAM] + [--downstream DOWNSTREAM] [--pdf] [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--scope] [--output OUTPUT] @@ -1016,6 +1046,9 @@ s:about: | to save manually assigned cell types. Should start with 'custom_', otherwise, it won't be shown in UCSC Cell Browser. + --splitby SPLITBY Column from the Seurat object metadata to additionally + split every cluster selected with --source into + smaller groups. Default: do not split --diffgenes Identify differentially expressed genes (putative gene markers) for assigned cell types. Ignored if loaded Seurat object doesn't include genes expression @@ -1073,6 +1106,13 @@ s:about: | frequency plots for the nearest peaks the loaded Seurat object should include ATAC assay as well as the --fragments file should be provided. Default: None + --upstream UPSTREAM Number of bases to extend the genome coverage region + for a specific gene upstream. Ignored if --genes or + --fragments parameters are not provided. Default: 2500 + --downstream DOWNSTREAM + Number of bases to extend the genome coverage region + for a specific gene downstream. Ignored if --genes or + --fragments parameters are not provided. Default: 2500 --pdf Export plots in PDF. Default: false --verbose Print debug information. Default: false --h5seurat Save Seurat data to h5seurat file. 
Default: false diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index 4ac3d21e..f17f312b 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -769,17 +769,20 @@ doc: | s:about: | - usage: sc_rna_cluster.R - [-h] --query QUERY [--dimensions [DIMENSIONS [DIMENSIONS ...]]] - [--ametric {euclidean,cosine,manhattan,hamming}] - [--algorithm {louvain,mult-louvain,slm,leiden}] - [--resolution [RESOLUTION [RESOLUTION ...]]] - [--genes [GENES [GENES ...]]] [--diffgenes] [--logfc LOGFC] - [--minpct MINPCT] [--onlypos] - [--testuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] - [--pdf] [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] - [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] - [--cpus CPUS] [--memory MEMORY] + usage: sc_rna_cluster.R [-h] --query QUERY + [--dimensions DIMENSIONS] + [--ametric {euclidean,cosine,manhattan,hamming}] + [--algorithm {louvain,mult-louvain,slm,leiden}] + [--resolution [RESOLUTION [RESOLUTION ...]]] + [--genes [GENES [GENES ...]]] + [--diffgenes] [--logfc LOGFC] + [--minpct MINPCT] [--onlypos] + [--testuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] + [--pdf] [--verbose] [--h5seurat] + [--h5ad] [--cbbuild] [--scope] + [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] Single-cell RNA-Seq Cluster Analysis @@ -790,12 +793,10 @@ s:about: | stored in the RNA assay, as well as 'pca' and 'rnaumap' dimensionality reductions applied to that assay. - --dimensions [DIMENSIONS [DIMENSIONS ...]] + --dimensions DIMENSIONS Dimensionality to use when constructing nearest- - neighbor graph before clustering (from 1 to 50). If - single value N is provided, use from 1 to N - dimensions. If multiple values are provided, subset to - only selected dimensions. Default: from 1 to 10 + neighbor graph before clustering (from 1 to 50). 
+ Default: 10 --ametric {euclidean,cosine,manhattan,hamming} Distance metric used when constructing nearest- neighbor graph before clustering. Default: euclidean @@ -837,6 +838,8 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save Seurat data to h5ad file. Default: false --cbbuild Export results to UCSC Cell Browser. Default: false + --scope Save Seurat data to SCope compatible loom file. + Default: false --output OUTPUT Output prefix. Default: ./sc --theme {gray,bw,linedraw,light,dark,minimal,classic,void} Color theme for all generated plots. Default: classic diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index 2a1f0f18..4e4c379d 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -527,14 +527,18 @@ doc: | s:about: | - usage: sc_rna_da_cells.R - [-h] --query QUERY [--reduction REDUCTION] - [--dimensions [DIMENSIONS [DIMENSIONS ...]]] [--knn [KNN [KNN ...]]] - [--metadata METADATA] --splitby SPLITBY --first FIRST --second SECOND - [--resolution [RESOLUTION [RESOLUTION ...]]] [--ranges RANGES RANGES] - [--pdf] [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] - [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] - [--cpus CPUS] [--memory MEMORY] + usage: sc_rna_da_cells.R [-h] --query QUERY + [--reduction REDUCTION] + [--dimensions DIMENSIONS] + [--knn [KNN [KNN ...]]] + [--metadata METADATA] --splitby + SPLITBY --first FIRST --second SECOND + [--resolution [RESOLUTION [RESOLUTION ...]]] + [--ranges RANGES RANGES] [--pdf] + [--verbose] [--h5seurat] [--h5ad] + [--cbbuild] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] Single-cell Differential Abundance Analysis @@ -549,11 +553,9 @@ s:about: | --reduction REDUCTION Dimensionality reduction to be used for DA analysis. 
Default: pca - --dimensions [DIMENSIONS [DIMENSIONS ...]] + --dimensions DIMENSIONS Dimensionality to use when running DA analysis (from 1 - to 50). If single value N is provided, use from 1 to N - PCs. If multiple values are provided, subset to only - selected PCs. Default: from 1 to 10 + to 50). Default: 10 --knn [KNN [KNN ...]] Array of k values for kNN graph construction when calculating the score vector for each cell to diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index 745635d4..3544b72f 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -786,21 +786,27 @@ doc: | s:about: | - usage: sc_rna_reduce.R - [-h] --query QUERY [--metadata METADATA] [--barcodes BARCODES] - [--cellcycle CELLCYCLE] [--norm {sct,log,sctglm}] - [--ntgr {seurat,harmony,none}] [--ntgrby [NTGRBY [NTGRBY ...]]] - [--highvargenes HIGHVARGENES] [--regressmt] - [--regressgenes [REGRESSGENES [REGRESSGENES ...]]] - [--regressccfull | --regressccdiff] - [--dimensions [DIMENSIONS [DIMENSIONS ...]]] [--uspread USPREAD] - [--umindist UMINDIST] [--uneighbors UNEIGHBORS] - [--umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis,mahalanobis,wminkowski,seuclidean,cosine,correlation,haversine,hamming,jaccard,dice,russelrao,kulsinski,ll_dirichlet,hellinger,rogerstanimoto,sokalmichener,sokalsneath,yule}] - [--umethod {uwot,uwot-learn,umap-learn}] [--pdf] [--verbose] - [--h5seurat] [--h5ad] [--scope] [--cbbuild] [--lowmem] - [--output OUTPUT] - [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] - [--cpus CPUS] [--memory MEMORY] + usage: sc_rna_reduce.R [-h] --query QUERY [--metadata METADATA] + [--barcodes BARCODES] + [--cellcycle CELLCYCLE] + [--norm {sct,log,sctglm}] + [--ntgr {seurat,harmony,none}] + [--ntgrby [NTGRBY [NTGRBY ...]]] + [--highvargenes HIGHVARGENES] + [--regressmt] + [--regressgenes REGRESSGENES] + [--regressccfull | --regressccdiff] + [--dimensions DIMENSIONS] + [--uspread USPREAD] + [--umindist UMINDIST] + [--uneighbors 
UNEIGHBORS] + [--umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis,mahalanobis,wminkowski,seuclidean,cosine,correlation,haversine,hamming,jaccard,dice,russelrao,kulsinski,ll_dirichlet,hellinger,rogerstanimoto,sokalmichener,sokalsneath,yule}] + [--umethod {uwot,uwot-learn,umap-learn}] + [--pdf] [--verbose] [--h5seurat] + [--h5ad] [--scope] [--cbbuild] + [--lowmem] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] Single-cell RNA-Seq Dimensionality Reduction Analysis @@ -860,8 +866,8 @@ s:about: | variation. Default: false --regressgenes REGRESSGENES Regex pattern to identify genes which expression - should be regressed as a confounding source of variation. - Default: none + should be regressed as a confounding source of + variation. Default: none --regressccfull Regress all signals associated with cell cycle phase. Ignored if --cellcycle is not provided. Mutually exclusive with --regressccdiff parameter. Default: @@ -871,14 +877,11 @@ s:about: | and cycling cells will be maintained. Ignored if --cellcycle is not provided. Mutually exclusive with --regressccfull Default: false - --dimensions [DIMENSIONS [DIMENSIONS ...]] - Dimensionality to use in UMAP projection (from 1 to - 50). If single value N is provided, use from 1 to N - PCs. If multiple values are provided, subset to only - specified PCs. In combination with --ntgr set to - harmony, multiple values will result in using all - principal components starting from 1 to the max of the - provided values. Default: from 1 to 10 + --dimensions DIMENSIONS + Dimensionality to use for datasets integration (if + provided RDS file includes multiple datasets and + --ntgr is not set to 'harmony') and UMAP projection + (from 1 to 50). Default: 10 --uspread USPREAD The effective scale of embedded points on UMAP. In combination with '--mindist' it determines how clustered/clumped the embedded points are. 
Default: 1 diff --git a/tools/sc-rna-trajectory.cwl b/tools/sc-rna-trajectory.cwl index 540a5efc..ab5a31e9 100644 --- a/tools/sc-rna-trajectory.cwl +++ b/tools/sc-rna-trajectory.cwl @@ -668,10 +668,10 @@ doc: | s:about: | usage: sc_rna_trajectory.R [-h] --query QUERY [--reduction REDUCTION] - [--dimensions [DIMENSIONS [DIMENSIONS ...]]] - --source SOURCE - [--barcodes BARCODES] + [--dimensions DIMENSIONS] --source + SOURCE [--barcodes BARCODES] [--start START] [--ngenes NGENES] + [--genes [GENES [GENES ...]]] [--pdf] [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] @@ -689,13 +689,9 @@ s:about: | --reduction REDUCTION Dimensionality reduction to be used in the trajectory analysis. Default: pca - --dimensions [DIMENSIONS [DIMENSIONS ...]] - Dimensionality to use (from 1 to 50). If single value - N is provided, use from 1 to N dimensions. If multiple - values are provided, subset to only selected - dimensions. May fail if user specified more dimensions - than it was available in the selected --reduction. - Default: use all available dimensions + --dimensions DIMENSIONS + Dimensionality to use (from 1 to 50). Default: use all + available dimensions --source SOURCE Column from the metadata of the loaded Seurat object to select clusters from --barcodes BARCODES Path to the TSV/CSV file to optionally prefilter and @@ -710,7 +706,8 @@ s:about: | trajectory. Default: defined automatically --ngenes NGENES Number of the most predictive genes to be shows on the gene expression heatmap. Default: 50 - --genes Genes of interest to build genes expression plots. + --genes [GENES [GENES ...]] + Genes of interest to build genes expression plots. Default: None --pdf Export plots in PDF. Default: false --verbose Print debug information. Default: false diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index 59dddfa7..a575ead5 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -176,6 +176,24 @@ inputs: plots will be built. 
Default: None + cvrg_upstream_bp: + type: int? + inputBinding: + prefix: "--upstream" + doc: | + Number of bases to extend the genome coverage region for + a specific gene upstream. Ignored if --genes or --fragments + parameters are not provided. Default: 2500 + + cvrg_downstream_bp: + type: int? + inputBinding: + prefix: "--downstream" + doc: | + Number of bases to extend the genome coverage region for + a specific gene downstream. Ignored if --genes or --fragments + parameters are not provided. Default: 2500 + identify_diff_genes: type: boolean? inputBinding: @@ -911,23 +929,31 @@ doc: | s:about: | - usage: sc_wnn_cluster.R - [-h] --query QUERY - [--rnadimensions [RNADIMENSIONS [RNADIMENSIONS ...]]] - [--atacdimensions [ATACDIMENSIONS [ATACDIMENSIONS ...]]] - [--algorithm {louvain,mult-louvain,slm,leiden}] [--uspread USPREAD] - [--umindist UMINDIST] [--uneighbors UNEIGHBORS] - [--umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis,mahalanobis,wminkowski,seuclidean,cosine,correlation,haversine,hamming,jaccard,dice,russelrao,kulsinski,ll_dirichlet,hellinger,rogerstanimoto,sokalmichener,sokalsneath,yule}] - [--umethod {uwot,uwot-learn,umap-learn}] - [--resolution [RESOLUTION [RESOLUTION ...]]] [--fragments FRAGMENTS] - [--genes [GENES [GENES ...]]] [--diffgenes] [--diffpeaks] - [--rnalogfc RNALOGFC] [--rnaminpct RNAMINPCT] [--rnaonlypos] - [--rnatestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] - [--ataclogfc ATACLOGFC] [--atacminpct ATACMINPCT] - [--atactestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] - [--pdf] [--verbose] [--h5seurat] [--h5ad] [--cbbuild] [--output OUTPUT] - [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] - [--cpus CPUS] [--memory MEMORY] + usage: sc_wnn_cluster.R [-h] --query QUERY + [--rnadimensions RNADIMENSIONS] + [--atacdimensions ATACDIMENSIONS] + [--algorithm {louvain,mult-louvain,slm,leiden}] + [--uspread USPREAD] + [--umindist UMINDIST] + [--uneighbors UNEIGHBORS] + [--umetric 
{euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis,mahalanobis,wminkowski,seuclidean,cosine,correlation,haversine,hamming,jaccard,dice,russelrao,kulsinski,ll_dirichlet,hellinger,rogerstanimoto,sokalmichener,sokalsneath,yule}] + [--umethod {uwot,uwot-learn,umap-learn}] + [--resolution [RESOLUTION [RESOLUTION ...]]] + [--fragments FRAGMENTS] + [--genes [GENES [GENES ...]]] + [--upstream UPSTREAM] + [--downstream DOWNSTREAM] [--diffgenes] + [--diffpeaks] [--rnalogfc RNALOGFC] + [--rnaminpct RNAMINPCT] [--rnaonlypos] + [--rnatestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] + [--ataclogfc ATACLOGFC] + [--atacminpct ATACMINPCT] + [--atactestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] + [--pdf] [--verbose] [--h5seurat] + [--h5ad] [--cbbuild] [--scope] + [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] Single-cell WNN Cluster Analysis @@ -939,20 +965,17 @@ s:about: | assays correspondingly. Additionally, 'pca', 'rnaumap', 'atac_lsi' and 'atacumap' dimensionality reductions should be present. - --rnadimensions [RNADIMENSIONS [RNADIMENSIONS ...]] + --rnadimensions RNADIMENSIONS Dimensionality from the 'pca' reduction to use when constructing weighted nearest-neighbor graph before - clustering (from 1 to 50). If single value N is - provided, use from 1 to N dimensions. If multiple - values are provided, subset to only selected - dimensions. Default: from 1 to 10 - --atacdimensions [ATACDIMENSIONS [ATACDIMENSIONS ...]] + clustering (from 1 to 50). Default: 10 + --atacdimensions ATACDIMENSIONS Dimensionality from the 'atac_lsi' reduction to use when constructing weighted nearest-neighbor graph - before clustering (from 1 to 50). If single value N is - provided, use from 2 to N dimensions. If multiple - values are provided, subset to only selected - dimensions. Default: from 2 to 10 + before clustering (from 2 to 50). 
First LSI component + is always excluded unless the provided RDS file + consists of multiple datasets where ATAC assay were + integrated with Harmony. Default: 10 --algorithm {louvain,mult-louvain,slm,leiden} Algorithm for modularity optimization when running clustering. Default: louvain @@ -994,6 +1017,13 @@ s:about: | insertion frequency plots for the nearest peaks. If ' --fragments' is not provided only gene expression plots will be built. Default: None + --upstream UPSTREAM Number of bases to extend the genome coverage region + for a specific gene upstream. Ignored if --genes or + --fragments parameters are not provided. Default: 2500 + --downstream DOWNSTREAM + Number of bases to extend the genome coverage region + for a specific gene downstream. Ignored if --genes or + --fragments parameters are not provided. Default: 2500 --diffgenes Identify differentially expressed genes (putative gene markers) between each pair of clusters for all resolutions. Default: false @@ -1039,6 +1069,9 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save Seurat data to h5ad file. Default: false --cbbuild Export results to UCSC Cell Browser. Default: false + --scope Save Seurat data to SCope compatible loom file. Only + not normalized raw counts from the RNA assay will be + saved. Default: false --output OUTPUT Output prefix. Default: ./sc --theme {gray,bw,linedraw,light,dark,minimal,classic,void} Color theme for all generated plots. Default: classic diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 8e07b5b4..82144e2b 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -92,6 +92,22 @@ inputs: "Dimensionality reduction" to be used for cluster names assignment. 
+ query_splitby_column: + type: + - "null" + - type: enum + symbols: + - "dataset" + - "condition" + - "none" + default: "none" + label: "Criteria to split every cluster by (optional)" + doc: | + Criteria to split every cluster defined by + the selected dimensionality reduction and + resolution into several groups. + Default: "none" + cell_type_data: type: File label: "Cell types" @@ -650,6 +666,18 @@ steps: query_target_column: source: [query_reduction, query_resolution] valueFrom: $(get_query_column("custom_", self[0], self[1])) + query_splitby_column: + source: query_splitby_column + valueFrom: | + ${ + if (self == "dataset") { + return "new.ident"; + } else if (self == "condition") { + return "condition"; + } else { + return null; + } + } atac_fragments_file: atac_fragments_file genes_of_interest: source: genes_of_interest From 026ce88e2525536f6f36cfb78e21d4e1037852c0 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 12 Jan 2024 02:17:41 -0500 Subject: [PATCH 100/162] Remove gene expression density PNG from the outputs --- workflows/sc-ctype-assign.cwl | 45 ----------------------------------- workflows/sc-rna-cluster.cwl | 15 ------------ workflows/sc-wnn-cluster.cwl | 15 ------------ 3 files changed, 75 deletions(-) diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 82144e2b..3bd90534 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -500,48 +500,6 @@ outputs: tab: 'Genes of interest' Caption: 'UMAP, gene expression, WNN' - xpr_per_cell_sgnl_rd_rnaumap_plot_png: - type: - - "null" - - type: array - items: File - outputSource: ctype_assign/xpr_per_cell_sgnl_rd_rnaumap_plot_png - label: "UMAP, gene expression density, RNA" - doc: | - UMAP, gene expression density, RNA - 'sd:visualPlugins': - - image: - tab: 'Genes of interest' - Caption: 'UMAP, gene expression density, RNA' - - xpr_per_cell_sgnl_rd_atacumap_plot_png: - type: - - "null" - - type: array - items: File - outputSource: 
ctype_assign/xpr_per_cell_sgnl_rd_atacumap_plot_png - label: "UMAP, gene expression density, ATAC" - doc: | - UMAP, gene expression density, ATAC - 'sd:visualPlugins': - - image: - tab: 'Genes of interest' - Caption: 'UMAP, gene expression density, ATAC' - - xpr_per_cell_sgnl_rd_wnnumap_plot_png: - type: - - "null" - - type: array - items: File - outputSource: ctype_assign/xpr_per_cell_sgnl_rd_wnnumap_plot_png - label: "UMAP, gene expression density, WNN" - doc: | - UMAP, gene expression density, WNN - 'sd:visualPlugins': - - image: - tab: 'Genes of interest' - Caption: 'UMAP, gene expression density, WNN' - xpr_htmp_plot_png: type: File? outputSource: ctype_assign/xpr_htmp_plot_png @@ -738,9 +696,6 @@ steps: - xpr_per_cell_rd_rnaumap_plot_png - xpr_per_cell_rd_atacumap_plot_png - xpr_per_cell_rd_wnnumap_plot_png - - xpr_per_cell_sgnl_rd_rnaumap_plot_png - - xpr_per_cell_sgnl_rd_atacumap_plot_png - - xpr_per_cell_sgnl_rd_wnnumap_plot_png - cvrg_plot_png - xpr_htmp_plot_png - umap_rd_rnaumap_plot_pdf diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index 1eaa02be..efaef32d 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -359,20 +359,6 @@ outputs: tab: 'Genes of interest' Caption: 'UMAP, gene expression' - xpr_per_cell_sgnl_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_rna_cluster/xpr_per_cell_sgnl_plot_png - label: "UMAP, gene expression density" - doc: | - UMAP, gene expression density - 'sd:visualPlugins': - - image: - tab: 'Genes of interest' - Caption: 'UMAP, gene expression density' - xpr_htmp_res_plot_png: type: - "null" @@ -522,7 +508,6 @@ steps: - cmp_gr_ph_spl_clst_res_plot_png - xpr_avg_res_plot_png - xpr_per_cell_plot_png - - xpr_per_cell_sgnl_plot_png - xpr_dnst_res_plot_png - xpr_htmp_res_plot_png - umap_res_plot_pdf diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index 0a97e5f6..fb6df6d7 100644 --- a/workflows/sc-wnn-cluster.cwl +++ 
b/workflows/sc-wnn-cluster.cwl @@ -393,20 +393,6 @@ outputs: tab: 'Genes of interest' Caption: 'UMAP, gene expression' - xpr_per_cell_sgnl_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_wnn_cluster/xpr_per_cell_sgnl_plot_png - label: "UMAP, gene expression density" - doc: | - UMAP, gene expression density - 'sd:visualPlugins': - - image: - tab: 'Genes of interest' - Caption: 'UMAP, gene expression density' - xpr_htmp_res_plot_png: type: - "null" @@ -591,7 +577,6 @@ steps: - cmp_gr_ph_spl_clst_res_plot_png - xpr_avg_res_plot_png - xpr_per_cell_plot_png - - xpr_per_cell_sgnl_plot_png - xpr_dnst_res_plot_png - cvrg_res_plot_png - xpr_htmp_res_plot_png From e6a82adfe7ec7281fe832080dbcfeb70bb19009c Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Sat, 13 Jan 2024 22:48:06 -0500 Subject: [PATCH 101/162] Update RNA reads and ATAC fragments labels --- workflows/sc-atac-cluster.cwl | 10 +-- workflows/sc-atac-coverage.cwl | 14 ++-- workflows/sc-atac-dbinding.cwl | 16 ++-- workflows/sc-atac-reduce.cwl | 12 +-- workflows/sc-ctype-assign.cwl | 10 +-- workflows/sc-multiome-filter.cwl | 140 ++++++++++++++++--------------- workflows/sc-rna-filter.cwl | 72 ++++++++-------- workflows/sc-rna-reduce.cwl | 14 ++-- workflows/sc-wnn-cluster.cwl | 10 +-- 9 files changed, 151 insertions(+), 147 deletions(-) diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index aafe59ef..95b7315e 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -60,7 +60,7 @@ inputs: label: "Cell Ranger ARC Sample (optional)" doc: | "Cell Ranger ARC Sample" for generating - fragments coverage plots over the genes + ATAC fragments coverage plots over the genes of interest. 
'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" 'sd:localLabel': true @@ -115,7 +115,7 @@ inputs: label: "Genes of interest" doc: | Comma or space separated list of genes - of interest to generate fragments coverage + of interest to generate ATAC fragments coverage plots. Ignored if "Cell Ranger ARC Sample" input is not provided. Default: None @@ -294,13 +294,13 @@ outputs: - type: array items: File outputSource: sc_atac_cluster/cvrg_res_plot_png - label: "Fragments coverage" + label: "ATAC fragments coverage" doc: | - Fragments coverage + ATAC fragments coverage 'sd:visualPlugins': - image: tab: 'Genome coverage' - Caption: 'Fragments coverage' + Caption: 'ATAC fragments coverage' peak_markers_tsv: type: File? diff --git a/workflows/sc-atac-coverage.cwl b/workflows/sc-atac-coverage.cwl index 38214a84..22c662f0 100644 --- a/workflows/sc-atac-coverage.cwl +++ b/workflows/sc-atac-coverage.cwl @@ -124,13 +124,13 @@ inputs: - "5" - "6" default: "1" - label: "Cores/CPUs" + label: "Number of cores/cpus to use" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. 
Default: 1 - "sd:layout": + 'sd:layout': advanced: true @@ -176,16 +176,16 @@ outputs: - type: array items: File outputSource: sc_atac_coverage/fragments_bigwig_file - label: "Genome coverage for fragments" + label: "Genome coverage for ATAC fragments" doc: | - Genome coverage calculated for fragments + Genome coverage calculated for ATAC fragments in bigWig format 'sd:visualPlugins': - igvbrowser: tab: 'Genome Browser' id: 'igvbrowser' type: 'wig' - name: "Fragments coverage" + name: "ATAC fragments coverage" height: 120 experiment_info: @@ -269,7 +269,7 @@ $schemas: label: "Single-cell ATAC-Seq Genome Coverage" s:name: "Single-cell ATAC-Seq Genome Coverage" -s:alternateName: "Creates genome coverage bigWig files from the provided fragments file and selected grouping parameters" +s:alternateName: "Creates genome coverage bigWig files from the provided ATAC fragments file and selected grouping parameters" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-atac-coverage.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -310,4 +310,4 @@ doc: | Single-cell ATAC-Seq Genome Coverage Creates genome coverage bigWig files from the provided - fragments file and selected grouping parameters \ No newline at end of file + ATAC fragments file and selected grouping parameters \ No newline at end of file diff --git a/workflows/sc-atac-dbinding.cwl b/workflows/sc-atac-dbinding.cwl index cd66d062..64698456 100644 --- a/workflows/sc-atac-dbinding.cwl +++ b/workflows/sc-atac-dbinding.cwl @@ -260,13 +260,13 @@ inputs: - "5" - "6" default: "1" - label: "Cores/CPUs" + label: "Number of cores/cpus to use" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. 
Default: 1 - "sd:layout": + 'sd:layout': advanced: true @@ -346,10 +346,10 @@ outputs: first_fragments_bigwig_file: type: File outputSource: sc_atac_dbinding/first_fragments_bigwig_file - label: "Genome coverage for fragments (first)" + label: "Genome coverage for ATAC fragments (first)" doc: | Genome coverage in bigWig format calculated - for fragments from the cells that belong to + for ATAC fragments from the cells that belong to the group defined by the --first and --groupby parameters. 'sd:visualPlugins': @@ -357,16 +357,16 @@ outputs: tab: 'Genome Browser' id: 'igvbrowser' type: 'wig' - name: "Fragments coverage (first)" + name: "ATAC fragments coverage (first)" height: 120 second_fragments_bigwig_file: type: File outputSource: sc_atac_dbinding/second_fragments_bigwig_file - label: "Genome coverage for fragments (second)" + label: "Genome coverage for ATAC fragments (second)" doc: | Genome coverage in bigWig format calculated - for fragments from the cells that belong to + for ATAC fragments from the cells that belong to the group defined by the --second and --groupby parameters. 'sd:visualPlugins': @@ -374,7 +374,7 @@ outputs: tab: 'Genome Browser' id: 'igvbrowser' type: 'wig' - name: "Fragments coverage (second)" + name: "ATAC fragments coverage (second)" height: 120 first_tn5ct_bigwig_file: diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index 8ed66b67..a9636cd2 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -275,14 +275,14 @@ outputs: umap_spl_frgm_plot_png: type: File? outputSource: sc_atac_reduce/umap_spl_frgm_plot_png - label: "UMAP, colored by dataset, split by fragments in peaks per cell" + label: "UMAP, colored by dataset, split by ATAC fragments in peaks per cell" doc: | UMAP, colored by dataset, split - by fragments in peaks per cell. + by ATAC fragments in peaks per cell. 
'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'UMAP, colored by dataset, split by fragments in peaks per cell' + Caption: 'UMAP, colored by dataset, split by ATAC fragments in peaks per cell' umap_spl_peak_plot_png: type: File? @@ -359,14 +359,14 @@ outputs: umap_gr_cnd_spl_frgm_plot_png: type: File? outputSource: sc_atac_reduce/umap_gr_cnd_spl_frgm_plot_png - label: "UMAP, colored by grouping condition, split by fragments in peaks per cell" + label: "UMAP, colored by grouping condition, split by ATAC fragments in peaks per cell" doc: | UMAP, colored by grouping condition, - split by fragments in peaks per cell + split by ATAC fragments in peaks per cell 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'UMAP, colored by grouping condition, split by fragments in peaks per cell' + Caption: 'UMAP, colored by grouping condition, split by ATAC fragments in peaks per cell' umap_gr_cnd_spl_peak_plot_png: type: File? diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 3bd90534..422c8a4b 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -65,7 +65,7 @@ inputs: label: "Cell Ranger ARC Sample (optional)" doc: | "Cell Ranger ARC Sample" for generating - fragments coverage plots over the genes + ATAC fragments coverage plots over the genes of interest. 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" 'sd:localLabel': true @@ -164,7 +164,7 @@ inputs: doc: | Comma or space separated list of genes of interest to visualize expression and - to generate fragments coverage plots. + to generate ATAC fragments coverage plots. Ignored if "Cell Ranger ARC Sample" input is not provided. 
Default: None @@ -517,13 +517,13 @@ outputs: - type: array items: File outputSource: ctype_assign/cvrg_plot_png - label: "Fragments coverage" + label: "ATAC fragments coverage" doc: | - Fragments coverage + ATAC fragments coverage 'sd:visualPlugins': - image: tab: 'Genome coverage' - Caption: 'Fragments coverage' + Caption: 'ATAC fragments coverage' xpr_htmp_tsv: type: File? diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 7aceb235..c35a56d8 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -157,11 +157,11 @@ inputs: minimum_umis: type: string? default: "500" - label: "Minimum number of transcripts per cell" + label: "Minimum number of RNA reads per cell" doc: | Quality control filtering threshold to exclude from the analysis all - cells with the number of transcripts + cells with the number of RNA reads smaller than the provided value. If the selected "Cell Ranger ARC Sample" includes multiple aggregated @@ -245,7 +245,7 @@ inputs: Quality control filtering threshold to exclude from the analysis all cells with the percentage of - transcripts mapped to mitochondrial + RNA reads mapped to mitochondrial genes exceeding the provided value. Default: 5 "sd:layout": @@ -282,11 +282,11 @@ inputs: minimum_fragments: type: string? default: "1000" - label: "Minimum number of fragments in peaks per cell" + label: "Minimum number of ATAC fragments in peaks per cell" doc: | Quality control filtering threshold to exclude from the analysis all - cells with the number of fragments + cells with the number of ATAC fragments in peaks smaller than the provided value. If the selected "Cell Ranger ARC Sample" includes multiple aggregated @@ -494,14 +494,14 @@ outputs: raw_umi_dnst_plot_png: type: File? 
outputSource: sc_multiome_filter/raw_umi_dnst_plot_png - label: "Transcripts per cell, raw" + label: "RNA reads per cell, raw" doc: | - Transcripts per cell density + RNA reads per cell density for raw data "sd:visualPlugins": - image: tab: "Raw" - Caption: "Transcripts per cell" + Caption: "RNA reads per cell" raw_gene_dnst_plot_png: type: File? @@ -518,21 +518,21 @@ outputs: raw_gene_umi_plot_png: type: File? outputSource: sc_multiome_filter/raw_gene_umi_plot_png - label: "Genes vs transcripts, raw" + label: "Genes vs RNA reads, raw" doc: | - Genes vs transcripts per cell + Genes vs RNA reads per cell for raw data "sd:visualPlugins": - image: tab: "Raw" - Caption: "Genes vs transcripts" + Caption: "Genes vs RNA reads" raw_mito_dnst_plot_png: type: File? outputSource: sc_multiome_filter/raw_mito_dnst_plot_png label: "Mitochondrial percentage, raw" doc: | - Percentage of transcripts mapped to + Percentage of RNA reads mapped to mitochondrial genes per cell density for raw data "sd:visualPlugins": @@ -555,14 +555,14 @@ outputs: raw_frgm_dnst_plot_png: type: File? outputSource: sc_multiome_filter/raw_frgm_dnst_plot_png - label: "Fragments in peaks per cell, raw" + label: "ATAC fragments in peaks per cell, raw" doc: | - Fragments in peaks per cell density - for raw data + ATAC fragments in peaks per + cell density for raw data "sd:visualPlugins": - image: tab: "Raw" - Caption: "Fragments in peaks per cell" + Caption: "ATAC fragments in peaks per cell" raw_peak_dnst_plot_png: type: File? @@ -592,26 +592,26 @@ outputs: raw_rna_atac_cnts_plot_png: type: File? 
outputSource: sc_multiome_filter/raw_rna_atac_cnts_plot_png - label: "Transcripts vs fragments in peaks, raw" + label: "RNA reads vs ATAC fragments in peaks, raw" doc: | - Transcripts vs fragments in peaks - per cell for raw data + RNA reads vs ATAC fragments in + peaks per cell for raw data "sd:visualPlugins": - image: tab: "Raw" - Caption: "Transcripts vs fragments in peaks" + Caption: "RNA reads vs ATAC fragments in peaks" raw_tss_frgm_plot_png: type: File? outputSource: sc_multiome_filter/raw_tss_frgm_plot_png - label: "TSS enrichment score vs fragments in peaks, raw" + label: "TSS enrichment score vs ATAC fragments in peaks, raw" doc: | - TSS enrichment score vs fragments in peaks - per cell for raw data + TSS enrichment score vs ATAC fragments + in peaks per cell for raw data "sd:visualPlugins": - image: tab: "Raw" - Caption: "TSS enrichment score vs fragments in peaks" + Caption: "TSS enrichment score vs ATAC fragments in peaks" raw_qc_mtrcs_dnst_plot_png: type: File? @@ -676,26 +676,26 @@ outputs: raw_frgm_hist_png: type: File? outputSource: sc_multiome_filter/raw_frgm_hist_png - label: "Fragments length, raw" + label: "ATAC fragments length, raw" doc: | - Fragments length distribution + ATAC fragments length distribution for raw data "sd:visualPlugins": - image: tab: "Raw" - Caption: "Fragments length" + Caption: "ATAC fragments length" raw_umi_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/raw_umi_dnst_spl_cnd_plot_png - label: "Transcripts per cell, raw, split by condition" + label: "RNA reads per cell, raw, split by condition" doc: | - Split by grouping condition transcripts + Split by grouping condition RNA reads per cell density for raw data "sd:visualPlugins": - image: tab: "Raw, by condition" - Caption: "Transcripts per cell" + Caption: "RNA reads per cell" raw_gene_dnst_spl_cnd_plot_png: type: File? 
@@ -715,7 +715,7 @@ outputs: label: "Mitochondrial percentage, raw, split by condition" doc: | Split by grouping condition the - percentage of transcripts mapped to + percentage of RNA reads mapped to mitochondrial genes per cell density for raw data "sd:visualPlugins": @@ -739,14 +739,15 @@ outputs: raw_frgm_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/raw_frgm_dnst_spl_cnd_plot_png - label: "Fragments in peaks per cell, raw, split by condition" + label: "ATAC fragments in peaks per cell, raw, split by condition" doc: | - Split by grouping condition fragments - in peaks per cell density for raw data + Split by grouping condition ATAC + fragments in peaks per cell density + for raw data "sd:visualPlugins": - image: tab: "Raw, by condition" - Caption: "Fragments in peaks per cell" + Caption: "ATAC fragments in peaks per cell" raw_peak_dnst_spl_cnd_plot_png: type: File? @@ -813,14 +814,14 @@ outputs: fltr_umi_dnst_plot_png: type: File? outputSource: sc_multiome_filter/fltr_umi_dnst_plot_png - label: "Transcripts per cell, filtered" + label: "RNA reads per cell, filtered" doc: | - Transcripts per cell density + RNA reads per cell density for filtered data "sd:visualPlugins": - image: tab: "Filtered" - Caption: "Transcripts per cell" + Caption: "RNA reads per cell" fltr_gene_dnst_plot_png: type: File? @@ -837,21 +838,21 @@ outputs: fltr_gene_umi_plot_png: type: File? outputSource: sc_multiome_filter/fltr_gene_umi_plot_png - label: "Genes vs transcripts, filtered" + label: "Genes vs RNA reads, filtered" doc: | - Genes vs transcripts per cell + Genes vs RNA reads per cell for filtered data "sd:visualPlugins": - image: tab: "Filtered" - Caption: "Genes vs transcripts" + Caption: "Genes vs RNA reads" fltr_mito_dnst_plot_png: type: File? 
outputSource: sc_multiome_filter/fltr_mito_dnst_plot_png label: "Mitochondrial percentage, filtered" doc: | - Percentage of transcripts mapped to + Percentage of RNA reads mapped to mitochondrial genes per cell density for filtered data "sd:visualPlugins": @@ -874,14 +875,14 @@ outputs: fltr_frgm_dnst_plot_png: type: File? outputSource: sc_multiome_filter/fltr_frgm_dnst_plot_png - label: "Fragments in peaks per cell, filtered" + label: "ATAC fragments in peaks per cell, filtered" doc: | - Fragments in peaks per cell density - for filtered data + ATAC fragments in peaks per cell + density for filtered data "sd:visualPlugins": - image: tab: "Filtered" - Caption: "Fragments in peaks per cell" + Caption: "ATAC fragments in peaks per cell" fltr_peak_dnst_plot_png: type: File? @@ -911,26 +912,27 @@ outputs: fltr_rna_atac_cnts_plot_png: type: File? outputSource: sc_multiome_filter/fltr_rna_atac_cnts_plot_png - label: "Transcripts vs fragments in peaks, filtered" + label: "RNA reads vs ATAC fragments in peaks, filtered" doc: | - Transcripts vs fragments in peaks - per cell for filtered data + RNA reads vs ATAC fragments in + peaks per cell for filtered data "sd:visualPlugins": - image: tab: "Filtered" - Caption: "Transcripts vs fragments in peaks" + Caption: "RNA reads vs ATAC fragments in peaks" fltr_tss_frgm_plot_png: type: File? outputSource: sc_multiome_filter/fltr_tss_frgm_plot_png - label: "TSS enrichment score vs fragments in peaks, filtered" + label: "TSS enrichment score vs ATAC fragments in peaks, filtered" doc: | - TSS enrichment score vs fragments in - peaks per cell for filtered data + TSS enrichment score vs ATAC + fragments in peaks per cell for + filtered data "sd:visualPlugins": - image: tab: "Filtered" - Caption: "TSS enrichment score vs fragments in peaks" + Caption: "TSS enrichment score vs ATAC fragments in peaks" fltr_qc_mtrcs_dnst_plot_png: type: File? @@ -995,26 +997,26 @@ outputs: fltr_frgm_hist_png: type: File? 
outputSource: sc_multiome_filter/fltr_frgm_hist_png - label: "Fragments length, filtered" + label: "ATAC fragments length, filtered" doc: | - Fragments length distribution + ATAC fragments length distribution for filtered data "sd:visualPlugins": - image: tab: "Filtered" - Caption: "Fragments length" + Caption: "ATAC fragments length" fltr_umi_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/fltr_umi_dnst_spl_cnd_plot_png - label: "Transcripts per cell, filtered, split by condition" + label: "RNA reads per cell, filtered, split by condition" doc: | - Split by grouping condition transcripts + Split by grouping condition RNA reads per cell density for filtered data "sd:visualPlugins": - image: tab: "Filtered, by condition" - Caption: "Transcripts per cell" + Caption: "RNA reads per cell" fltr_gene_dnst_spl_cnd_plot_png: type: File? @@ -1034,7 +1036,7 @@ outputs: label: "Mitochondrial percentage, filtered, split by condition" doc: | Split by grouping condition the - percentage of transcripts mapped to + percentage of RNA reads mapped to mitochondrial genes per cell density for filtered data "sd:visualPlugins": @@ -1058,15 +1060,15 @@ outputs: fltr_frgm_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/fltr_frgm_dnst_spl_cnd_plot_png - label: "Fragments in peaks per cell, filtered, split by condition" + label: "ATAC fragments in peaks per cell, filtered, split by condition" doc: | - Split by grouping condition fragments - in peaks per cell density for filtered - data + Split by grouping condition ATAC + fragments in peaks per cell density + for filtered data "sd:visualPlugins": - image: tab: "Filtered, by condition" - Caption: "Fragments in peaks per cell" + Caption: "ATAC fragments in peaks per cell" fltr_peak_dnst_spl_cnd_plot_png: type: File? 
@@ -1141,14 +1143,16 @@ outputs: outputSource: sc_multiome_filter/stdout_log label: "Output log, filtering step" doc: | - stdout log generated by sc_multiome_filter step + stdout log generated by + sc_multiome_filter step sc_multiome_filter_stderr_log: type: File outputSource: sc_multiome_filter/stderr_log label: "Error log, filtering step" doc: | - stderr log generated by sc_multiome_filter step + stderr log generated by + sc_multiome_filter step steps: diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index d8d79144..3d6ab961 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -98,9 +98,9 @@ inputs: minimum_umis: type: string? default: "500" - label: "Include cells where at least this many UMI (transcripts) are detected" + label: "Include cells where at least this many RNA reads are detected" doc: | - Include cells where at least this many UMI (transcripts) are detected. + Include cells where at least this many RNA reads are detected. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. Default: 500 (applied to all datasets) @@ -133,9 +133,9 @@ inputs: maximum_mito_perc: type: float? default: 5 - label: "Include cells with the percentage of transcripts mapped to mitochondrial genes not bigger than this value" + label: "Include cells with the percentage of RNA reads mapped to mitochondrial genes not bigger than this value" doc: | - Include cells with the percentage of transcripts mapped to mitochondrial + Include cells with the percentage of RNA reads mapped to mitochondrial genes not bigger than this value. Default: 5 (applied to all datasets) 'sd:layout': @@ -209,13 +209,13 @@ inputs: - "5" - "6" default: "1" - label: "Cores/CPUs" + label: "Number of cores/cpus to use" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. 
Default: 1 - "sd:layout": + 'sd:layout': advanced: true @@ -260,14 +260,14 @@ outputs: raw_umi_dnst_plot_png: type: File? outputSource: sc_rna_filter/raw_umi_dnst_plot_png - label: "Transcripts per cell density (not filtered)" + label: "RNA reads per cell density (not filtered)" doc: | - Transcripts per cell density (not filtered). + RNA reads per cell density (not filtered). PNG format 'sd:visualPlugins': - image: tab: 'Not filtered QC' - Caption: 'Transcripts per cell density' + Caption: 'RNA reads per cell density' raw_gene_dnst_plot_png: type: File? @@ -284,26 +284,26 @@ outputs: raw_gene_umi_plot_png: type: File? outputSource: sc_rna_filter/raw_gene_umi_plot_png - label: "Genes vs transcripts per cell correlation (not filtered)" + label: "Genes vs RNA reads per cell correlation (not filtered)" doc: | - Genes vs transcripts per cell correlation (not filtered). + Genes vs RNA reads per cell correlation (not filtered). PNG format 'sd:visualPlugins': - image: tab: 'Not filtered QC' - Caption: 'Genes vs transcripts per cell correlation' + Caption: 'Genes vs RNA reads per cell correlation' raw_mito_dnst_plot_png: type: File? outputSource: sc_rna_filter/raw_mito_dnst_plot_png - label: "Percentage of transcripts mapped to mitochondrial genes per cell density (not filtered)" + label: "Percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered)" doc: | - Percentage of transcripts mapped to mitochondrial genes per cell density (not filtered). + Percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered). PNG format 'sd:visualPlugins': - image: tab: 'Not filtered QC' - Caption: 'Percentage of transcripts mapped to mitochondrial genes per cell density' + Caption: 'Percentage of RNA reads mapped to mitochondrial genes per cell density' raw_nvlt_dnst_plot_png: type: File? @@ -344,14 +344,14 @@ outputs: raw_umi_dnst_spl_cnd_plot_png: type: File? 
outputSource: sc_rna_filter/raw_umi_dnst_spl_cnd_plot_png - label: "Split by grouping condition transcripts per cell density (not filtered)" + label: "Split by grouping condition RNA reads per cell density (not filtered)" doc: | - Split by grouping condition transcripts per cell density (not filtered). + Split by grouping condition RNA reads per cell density (not filtered). PNG format 'sd:visualPlugins': - image: tab: 'Not filtered QC' - Caption: 'Split by grouping condition transcripts per cell density' + Caption: 'Split by grouping condition RNA reads per cell density' raw_gene_dnst_spl_cnd_plot_png: type: File? @@ -368,15 +368,15 @@ outputs: raw_mito_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/raw_mito_dnst_spl_cnd_plot_png - label: "Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density (not filtered)" + label: "Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered)" doc: | - Split by grouping condition the percentage of transcripts mapped + Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered). PNG format 'sd:visualPlugins': - image: tab: 'Not filtered QC' - Caption: 'Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density' + Caption: 'Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density' raw_nvlt_dnst_spl_cnd_plot_png: type: File? @@ -429,14 +429,14 @@ outputs: fltr_umi_dnst_plot_png: type: File? outputSource: sc_rna_filter/fltr_umi_dnst_plot_png - label: "Transcripts per cell density (filtered)" + label: "RNA reads per cell density (filtered)" doc: | - Transcripts per cell density (filtered). + RNA reads per cell density (filtered). 
PNG format 'sd:visualPlugins': - image: tab: 'Filtered QC' - Caption: 'Transcripts per cell density' + Caption: 'RNA reads per cell density' fltr_gene_dnst_plot_png: type: File? @@ -453,26 +453,26 @@ outputs: fltr_gene_umi_plot_png: type: File? outputSource: sc_rna_filter/fltr_gene_umi_plot_png - label: "Genes vs transcripts per cell correlation (filtered)" + label: "Genes vs RNA reads per cell correlation (filtered)" doc: | - Genes vs transcripts per cell correlation (filtered). + Genes vs RNA reads per cell correlation (filtered). PNG format 'sd:visualPlugins': - image: tab: 'Filtered QC' - Caption: 'Genes vs transcripts per cell correlation' + Caption: 'Genes vs RNA reads per cell correlation' fltr_mito_dnst_plot_png: type: File? outputSource: sc_rna_filter/fltr_mito_dnst_plot_png - label: "Percentage of transcripts mapped to mitochondrial genes per cell density (filtered)" + label: "Percentage of RNA reads mapped to mitochondrial genes per cell density (filtered)" doc: | - Percentage of transcripts mapped to mitochondrial genes per cell density (filtered). + Percentage of RNA reads mapped to mitochondrial genes per cell density (filtered). PNG format 'sd:visualPlugins': - image: tab: 'Filtered QC' - Caption: 'Percentage of transcripts mapped to mitochondrial genes per cell density' + Caption: 'Percentage of RNA reads mapped to mitochondrial genes per cell density' fltr_nvlt_dnst_plot_png: type: File? @@ -513,14 +513,14 @@ outputs: fltr_umi_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/fltr_umi_dnst_spl_cnd_plot_png - label: "Split by grouping condition transcripts per cell density (filtered)" + label: "Split by grouping condition RNA reads per cell density (filtered)" doc: | - Split by grouping condition transcripts per cell density (filtered). + Split by grouping condition RNA reads per cell density (filtered). 
PNG format 'sd:visualPlugins': - image: tab: 'Filtered QC' - Caption: 'Split by grouping condition transcripts per cell density' + Caption: 'Split by grouping condition RNA reads per cell density' fltr_gene_dnst_spl_cnd_plot_png: type: File? @@ -537,15 +537,15 @@ outputs: fltr_mito_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/fltr_mito_dnst_spl_cnd_plot_png - label: "Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density (filtered)" + label: "Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density (filtered)" doc: | - Split by grouping condition the percentage of transcripts mapped + Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density (filtered). PNG format 'sd:visualPlugins': - image: tab: 'Filtered QC' - Caption: 'Split by grouping condition the percentage of transcripts mapped to mitochondrial genes per cell density' + Caption: 'Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density' fltr_nvlt_dnst_spl_cnd_plot_png: type: File? diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index faebb404..e079ed07 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -181,7 +181,7 @@ inputs: label: "Regress mitochondrial percentage" default: false doc: | - Regress the percentage of transcripts + Regress the percentage of RNA reads mapped to mitochondrial genes as a confounding source of variation. Default: false @@ -374,14 +374,14 @@ outputs: umap_spl_umi_plot_png: type: File? 
outputSource: sc_rna_reduce/umap_spl_umi_plot_png - label: "UMAP, colored by dataset, split by transcripts per cell" + label: "UMAP, colored by dataset, split by RNA reads per cell" doc: | UMAP, colored by dataset, split by - transcripts per cell + RNA reads per cell 'sd:visualPlugins': - image: tab: 'Per dataset' - Caption: 'UMAP, colored by dataset, split by transcripts per cell' + Caption: 'UMAP, colored by dataset, split by RNA reads per cell' umap_spl_gene_plot_png: type: File? @@ -446,14 +446,14 @@ outputs: umap_gr_cnd_spl_umi_plot_png: type: File? outputSource: sc_rna_reduce/umap_gr_cnd_spl_umi_plot_png - label: "UMAP, colored by grouping condition, split by transcripts per cell" + label: "UMAP, colored by grouping condition, split by RNA reads per cell" doc: | UMAP, colored by grouping condition, - split by transcripts per cell + split by RNA reads per cell 'sd:visualPlugins': - image: tab: 'Per group' - Caption: 'UMAP, colored by grouping condition, split by transcripts per cell' + Caption: 'UMAP, colored by grouping condition, split by RNA reads per cell' umap_gr_cnd_spl_gene_plot_png: type: File? diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index fb6df6d7..5d0b3a9d 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -62,7 +62,7 @@ inputs: label: "Cell Ranger ARC Sample (optional)" doc: | "Cell Ranger ARC Sample" for generating - fragments coverage plots over the genes + ATAC fragments coverage plots over the genes of interest. 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" 'sd:localLabel': true @@ -148,7 +148,7 @@ inputs: doc: | Comma or space separated list of genes of interest to visualize expression and - to generate fragments coverage plots. + to generate ATAC fragments coverage plots. Ignored if "Cell Ranger ARC Sample" input is not provided. 
Default: None @@ -413,13 +413,13 @@ outputs: - type: array items: File outputSource: sc_wnn_cluster/cvrg_res_plot_png - label: "Fragments coverage" + label: "ATAC fragments coverage" doc: | - Fragments coverage + ATAC fragments coverage 'sd:visualPlugins': - image: tab: 'Genome coverage' - Caption: 'Fragments coverage' + Caption: 'ATAC fragments coverage' xpr_htmp_res_tsv: type: From 6f1facfdad976c7e937d2324fccfb9c0614d8fa9 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 15 Jan 2024 19:36:48 -0500 Subject: [PATCH 102/162] Move file input in the sc ctype assign workflow to the bottom --- workflows/sc-ctype-assign.cwl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 422c8a4b..1339f5a0 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -108,16 +108,6 @@ inputs: resolution into several groups. Default: "none" - cell_type_data: - type: File - label: "Cell types" - doc: | - A TSV/CSV file with the names for each - cluster defined by "Clustering resolution" - and "Dimensionality reduction" parameters. - The file should have two columns named - 'cluster' and 'celltype'. - identify_diff_genes: type: boolean? default: true @@ -169,6 +159,16 @@ inputs: is not provided. Default: None + cell_type_data: + type: File + label: "Cell types" + doc: | + A TSV/CSV file with the names for each + cluster defined by "Clustering resolution" + and "Dimensionality reduction" parameters. + The file should have two columns named + 'cluster' and 'celltype'. 
+ color_theme: type: - "null" From dfdeeb75f77cf3908d701a98d5e21e0bc66a0b15 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 18 Jan 2024 18:42:23 -0500 Subject: [PATCH 103/162] Change sc workflow labels and descriptions --- workflows/cellranger-aggr.cwl | 16 +++++++------ workflows/cellranger-arc-aggr.cwl | 15 ++++++------ workflows/cellranger-arc-count.cwl | 14 +++++++---- workflows/cellranger-atac-aggr.cwl | 12 +++++----- workflows/cellranger-atac-count.cwl | 13 ++++++---- workflows/cellranger-mkref.cwl | 14 +++++++---- workflows/cellranger-mkvdjref.cwl | 13 +++++----- workflows/cellranger-multi.cwl | 18 ++++++++------ workflows/fastq-download.cwl | 7 +++--- workflows/sc-atac-cluster.cwl | 15 +++++++----- workflows/sc-atac-coverage.cwl | 12 +++++----- workflows/sc-atac-dbinding.cwl | 13 +++++----- workflows/sc-atac-reduce.cwl | 18 ++++++++------ workflows/sc-ctype-assign.cwl | 24 +++++++++++++------ workflows/sc-multiome-filter.cwl | 17 +++++++------ workflows/sc-rna-cluster.cwl | 14 ++++++----- workflows/sc-rna-da-cells.cwl | 12 +++++----- workflows/sc-rna-de-pseudobulk.cwl | 14 +++++------ workflows/sc-rna-filter.cwl | 15 +++++++----- workflows/sc-rna-reduce.cwl | 18 ++++++++------ workflows/sc-rna-trajectory.cwl | 12 +++++----- workflows/sc-vdj-profile.cwl | 11 +++++---- workflows/sc-wnn-cluster.cwl | 19 ++++++++------- .../single-cell-preprocess-cellranger.cwl | 15 +++++++----- 24 files changed, 204 insertions(+), 147 deletions(-) diff --git a/workflows/cellranger-aggr.cwl b/workflows/cellranger-aggr.cwl index 0e343304..39a74710 100644 --- a/workflows/cellranger-aggr.cwl +++ b/workflows/cellranger-aggr.cwl @@ -293,9 +293,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cell Ranger Aggregate" -s:name: "Cell Ranger Aggregate" -s:alternateName: "Aggregates data from multiple Cell Ranger Count Gene Expression experiments" +label: "Cell Ranger Aggregate (RNA, RNA+VDJ)" 
+s:name: "Cell Ranger Aggregate (RNA, RNA+VDJ)" +s:alternateName: "Combines outputs from multiple runs of either Cell Ranger Count (RNA) or Cell Ranger Count (RNA+VDJ) pipelines" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/cellranger-aggr.cwl s:codeRepository: https://github.com/datirium/workflows @@ -333,7 +333,9 @@ s:creator: doc: | - Cell Ranger Aggregate - - Aggregates outputs from multiple runs of Cell Ranger Count Gene Expression or - Cell Ranger Multi Gene Expression and V(D)J Repertoire Profiling experiments + Cell Ranger Aggregate (RNA, RNA+VDJ) + + Combines outputs from multiple runs of either “Cell Ranger Count (RNA)” + or “Cell Ranger Count (RNA+VDJ)” pipelines. The results of this workflow + are primarily used in “Single-Cell RNA-Seq Filtering Analysis” and + “Single-Cell Immune Profiling Analysis” pipelines. diff --git a/workflows/cellranger-arc-aggr.cwl b/workflows/cellranger-arc-aggr.cwl index 3c969b45..ac643f33 100644 --- a/workflows/cellranger-arc-aggr.cwl +++ b/workflows/cellranger-arc-aggr.cwl @@ -356,9 +356,9 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cell Ranger ARC Aggregate Gene Expression and Chromatin Accessibility" -s:name: "Cell Ranger ARC Aggregate Gene Expression and Chromatin Accessibility" -s:alternateName: "Aggregates data from multiple Cell Ranger ARC Count Gene Expression and Chromatin Accessibility experiments" +label: "Cell Ranger Aggregate (RNA+ATAC)" +s:name: "Cell Ranger Aggregate (RNA+ATAC)" +s:alternateName: "Combines outputs from multiple runs of Cell Ranger Count (RNA+ATAC) pipeline" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/cellranger-arc-aggr.cwl s:codeRepository: https://github.com/datirium/workflows @@ -396,7 +396,8 @@ s:creator: doc: | - Cell Ranger ARC Aggregate Gene Expression and Chromatin Accessibility - - Aggregates data from multiple Cell Ranger ARC 
Count Gene - Expression and Chromatin Accessibility experiments. + Cell Ranger Aggregate (RNA+ATAC) + + Combines outputs from multiple runs of “Cell Ranger Count (RNA+ATAC)” + pipeline. The results of this workflow are primarily used in + “Single-Cell Multiome ATAC and RNA-Seq Filtering Analysis” pipeline. diff --git a/workflows/cellranger-arc-count.cwl b/workflows/cellranger-arc-count.cwl index d91590e0..022f0b16 100644 --- a/workflows/cellranger-arc-count.cwl +++ b/workflows/cellranger-arc-count.cwl @@ -632,9 +632,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -s:name: "Cell Ranger ARC Count Gene Expression and Chromatin Accessibility" -label: "Cell Ranger ARC Count Gene Expression and Chromatin Accessibility" -s:alternateName: "Counts gene expression and chromatin accessibility for a single library" +s:name: "Cell Ranger Count (RNA+ATAC)" +label: "Cell Ranger Count (RNA+ATAC)" +s:alternateName: "Quantifies single-cell gene expression and chromatin accessibility of the sequencing data from a single 10x Genomics library in a combined manner" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/cellranger-arc-count.cwl s:codeRepository: https://github.com/datirium/workflows @@ -672,6 +672,10 @@ s:creator: doc: | - Cell Ranger ARC Count Gene Expression and Chromatin Accessibility + Cell Ranger Count (RNA+ATAC) - Counts gene expression and chromatin accessibility for a single library \ No newline at end of file + Quantifies single-cell gene expression and chromatin accessibility + of the sequencing data from a single 10x Genomics library in a + combined manner. The results of this workflow are primarily used in + either “Single-Cell Multiome ATAC and RNA-Seq Filtering Analysis” + or “Cell Ranger Aggregate (RNA+ATAC)” pipelines. 
\ No newline at end of file diff --git a/workflows/cellranger-atac-aggr.cwl b/workflows/cellranger-atac-aggr.cwl index 80aaa8a5..7036680a 100644 --- a/workflows/cellranger-atac-aggr.cwl +++ b/workflows/cellranger-atac-aggr.cwl @@ -316,9 +316,9 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cellranger ATAC Aggregate" -s:name: "Cellranger ATAC Aggregate" -s:alternateName: "Aggregates outputs from multiple runs of Cell Ranger Count Chromatin Accessibility experiments" +label: "Cellranger Aggregate (ATAC)" +s:name: "Cellranger Aggregate (ATAC)" +s:alternateName: "Combines outputs from multiple runs of Cell Ranger Count (ATAC) pipeline" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/cellranger-atac-aggr.cwl s:codeRepository: https://github.com/datirium/workflows @@ -356,7 +356,7 @@ s:creator: doc: | - Cellranger ATAC Aggregate + Cellranger Aggregate (ATAC) - Aggregates outputs from multiple runs of Cell Ranger Count Chromatin - Accessibility experiments + Combines outputs from multiple runs of “Cell Ranger + Count (ATAC)” pipeline. 
diff --git a/workflows/cellranger-atac-count.cwl b/workflows/cellranger-atac-count.cwl index fd4f87cb..a9569e2e 100644 --- a/workflows/cellranger-atac-count.cwl +++ b/workflows/cellranger-atac-count.cwl @@ -476,9 +476,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cell Ranger ATAC Count" -s:name: "Cell Ranger ATAC Count" -s:alternateName: "Counts reads from a single scATAC-Seq library" +label: "Cell Ranger Count (ATAC)" +s:name: "Cell Ranger Count (ATAC)" +s:alternateName: "Quantifies single-cell chromatin accessibility of the sequencing data from a single 10x Genomics library" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/cellranger-atac-count.cwl s:codeRepository: https://github.com/datirium/workflows @@ -516,6 +516,9 @@ s:creator: doc: | - Cell Ranger ATAC Count + Cell Ranger Count (ATAC) - Counts reads from a single scATAC-Seq library \ No newline at end of file + Quantifies single-cell chromatin accessibility of the sequencing + data from a single 10x Genomics library. The results of this + workflow are primarily used in “Cellranger Aggregate (ATAC)” + pipeline. 
\ No newline at end of file diff --git a/workflows/cellranger-mkref.cwl b/workflows/cellranger-mkref.cwl index b6ccb189..297f7eb3 100644 --- a/workflows/cellranger-mkref.cwl +++ b/workflows/cellranger-mkref.cwl @@ -194,9 +194,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -s:name: "Cell Ranger Build Reference Indices" -label: "Cell Ranger Build Reference Indices" -s:alternateName: "Builds reference genome indices for Cell Ranger Gene Expression and Cell Ranger Multiome ATAC + Gene Expression experiments" +s:name: "Cell Ranger Reference (RNA, ATAC, RNA+ATAC)" +label: "Cell Ranger Reference (RNA, ATAC, RNA+ATAC)" +s:alternateName: "Builds a reference genome of a selected species for quantifying gene expression and chromatin accessibility" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/cellranger-mkref.cwl s:codeRepository: https://github.com/datirium/workflows @@ -234,5 +234,9 @@ s:creator: doc: | - Cell Ranger Build Reference Indices - =================================== \ No newline at end of file + Cell Ranger Reference (RNA, ATAC, RNA+ATAC) + + Builds a reference genome of a selected species for quantifying + gene expression and chromatin accessibility. The results of this + workflow are used in all “Cell Ranger Count” and “Cell Ranger + Aggregate” pipelines. 
\ No newline at end of file diff --git a/workflows/cellranger-mkvdjref.cwl b/workflows/cellranger-mkvdjref.cwl index 3513635d..f9d23f7d 100644 --- a/workflows/cellranger-mkvdjref.cwl +++ b/workflows/cellranger-mkvdjref.cwl @@ -98,9 +98,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cell Ranger Build V(D)J Reference Indices" -s:name: "Cell Ranger Build V(D)J Reference Indices" -s:alternateName: "Build a Cell Ranger V(D)J-compatible reference folder from a user-supplied genome FASTA and gene GTF files" +label: "Cell Ranger Reference (VDJ)" +s:name: "Cell Ranger Reference (VDJ)" +s:alternateName: "Builds a reference genome of a selected species for V(D)J contigs assembly and clonotype calling" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/cellranger-mkvdjref.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -138,7 +138,8 @@ s:creator: doc: | - Cell Ranger Build V(D)J Reference Indices + Cell Ranger Reference (VDJ) - Build a Cell Ranger V(D)J-compatible reference folder from - a user-supplied genome FASTA and gene GTF files. \ No newline at end of file + Builds a reference genome of a selected species for V(D)J + contigs assembly and clonotype calling. The results of this + workflow are used in “Cell Ranger Count (RNA+VDJ)” pipeline. 
\ No newline at end of file diff --git a/workflows/cellranger-multi.cwl b/workflows/cellranger-multi.cwl index dd7c2142..cf3c5b52 100644 --- a/workflows/cellranger-multi.cwl +++ b/workflows/cellranger-multi.cwl @@ -630,9 +630,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cell Ranger Multi Gene Expression and V(D)J Repertoire Profiling" -s:name: "Cell Ranger Multi Gene Expression and V(D)J Repertoire Profiling" -s:alternateName: "Quantifies gene expression and performs profiling of V(D)J repertoire from a single GEM well" +label: "Cell Ranger Count (RNA+VDJ)" +s:name: "Cell Ranger Count (RNA+VDJ)" +s:alternateName: "Quantifies single-cell gene expression, performs V(D)J contigs assembly and clonotype calling of the sequencing data from a single 10x Genomics library in a combined manner" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/cellranger-multi.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -670,7 +670,11 @@ s:creator: doc: | - Cell Ranger Multi Gene Expression and V(D)J Repertoire Profiling - - Quantifies gene expression and performs profiling of V(D)J - repertoire from a single GEM well \ No newline at end of file + Cell Ranger Count (RNA+VDJ) + + Quantifies single-cell gene expression, performs V(D)J contigs + assembly and clonotype calling of the sequencing data from a + single 10x Genomics library in a combined manner. The results + of this workflow are primarily used in either “Single-Cell + RNA-Seq Filtering Analysis”, “Single-Cell Immune Profiling Analysis”, + or “Cell Ranger Aggregate (RNA, RNA+VDJ)” pipelines. 
\ No newline at end of file diff --git a/workflows/fastq-download.cwl b/workflows/fastq-download.cwl index 01286ca3..3a3c617a 100644 --- a/workflows/fastq-download.cwl +++ b/workflows/fastq-download.cwl @@ -231,7 +231,7 @@ $schemas: label: "FASTQ Download" s:name: "FASTQ Download" -s:alternateName: "Downloads FASTQ files from the provided SRR identifiers" +s:alternateName: "Assists in downloading problematic single-cell sequencing data from Sequence Read Archive (SRA)" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/fastq-download.cwl s:codeRepository: https://github.com/datirium/workflows @@ -270,5 +270,6 @@ s:creator: doc: | FASTQ Download - - Downloads FASTQ files from the provided SRR identifiers + + Assists in downloading problematic single-cell sequencing + data from Sequence Read Archive (SRA) diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index 95b7315e..aeb64e0b 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -462,9 +462,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell ATAC-Seq Cluster Analysis" -s:name: "Single-cell ATAC-Seq Cluster Analysis" -s:alternateName: "Clusters single-cell ATAC-Seq datasets, identifies differentially accessible peaks" +label: "Single-Cell ATAC-Seq Cluster Analysis" +s:name: "Single-Cell ATAC-Seq Cluster Analysis" +s:alternateName: "Clusters cells by similarity of chromatin accessibility data" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-atac-cluster.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -502,7 +502,10 @@ s:creator: doc: | - Single-cell ATAC-Seq Cluster Analysis + Single-Cell ATAC-Seq Cluster Analysis - Clusters single-cell ATAC-Seq datasets, identifies - differentially accessible peaks. 
\ No newline at end of file + Clusters cells by similarity of chromatin accessibility data + from the outputs of “Single-Cell ATAC-Seq Dimensionality + Reduction Analysis” pipeline. The results of this workflow are + primarily used in “Single-Cell Manual Cell Type Assignment” + pipeline. \ No newline at end of file diff --git a/workflows/sc-atac-coverage.cwl b/workflows/sc-atac-coverage.cwl index 22c662f0..c71350fe 100644 --- a/workflows/sc-atac-coverage.cwl +++ b/workflows/sc-atac-coverage.cwl @@ -267,9 +267,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell ATAC-Seq Genome Coverage" -s:name: "Single-cell ATAC-Seq Genome Coverage" -s:alternateName: "Creates genome coverage bigWig files from the provided ATAC fragments file and selected grouping parameters" +label: "Single-Cell ATAC-Seq Genome Coverage" +s:name: "Single-Cell ATAC-Seq Genome Coverage" +s:alternateName: "Generates genome coverage tracks from chromatin accessibility data of selected cells" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-atac-coverage.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -307,7 +307,7 @@ s:creator: doc: | - Single-cell ATAC-Seq Genome Coverage + Single-Cell ATAC-Seq Genome Coverage - Creates genome coverage bigWig files from the provided - ATAC fragments file and selected grouping parameters \ No newline at end of file + Generates genome coverage tracks from chromatin + accessibility data of selected cells \ No newline at end of file diff --git a/workflows/sc-atac-dbinding.cwl b/workflows/sc-atac-dbinding.cwl index 64698456..a4233a7e 100644 --- a/workflows/sc-atac-dbinding.cwl +++ b/workflows/sc-atac-dbinding.cwl @@ -925,9 +925,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell ATAC-Seq Differential Binding 
Analysis" -s:name: "Single-cell ATAC-Seq Differential Binding Analysis" -s:alternateName: "Identifies differential bound sites between two groups of cells" +label: "Single-Cell ATAC-Seq Differential Binding Analysis" +s:name: "Single-Cell ATAC-Seq Differential Binding Analysis" +s:alternateName: "Identifies differentially bound sites between any two groups of cells" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-atac-dbinding.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -965,7 +965,8 @@ s:creator: doc: | - Single-cell ATAC-Seq Differential Binding Analysis + Single-Cell ATAC-Seq Differential Binding Analysis - Identifies differential bound sites between two - groups of cells \ No newline at end of file + Identifies differentially bound sites between any two + groups of cells, optionally aggregating chromatin + accessibility data from single-cell to pseudobulk form. \ No newline at end of file diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index a9636cd2..f24ffbbc 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -602,9 +602,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell ATAC-Seq Dimensionality Reduction Analysis" -s:name: "Single-cell ATAC-Seq Dimensionality Reduction Analysis" -s:alternateName: "Integrates multiple single-cell ATAC-Seq datasets, reduces dimensionality using LSI" +label: "Single-Cell ATAC-Seq Dimensionality Reduction Analysis" +s:name: "Single-Cell ATAC-Seq Dimensionality Reduction Analysis" +s:alternateName: "Removes noise and confounding sources of variation by reducing dimensionality of chromatin accessibility data" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-atac-reduce.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -642,7 +642,11 
@@ s:creator: doc: | - Single-cell ATAC-Seq Dimensionality Reduction Analysis - - Integrates multiple single-cell ATAC-Seq datasets, - reduces dimensionality using LSI. \ No newline at end of file + Single-Cell ATAC-Seq Dimensionality Reduction Analysis + + Removes noise and confounding sources of variation by reducing + dimensionality of chromatin accessibility data from the outputs + of “Single-Cell Multiome ATAC and RNA-Seq Filtering Analysis” + pipelines. The results of this workflow are primarily used in + “Single-Cell ATAC-Seq Cluster Analysis” or “Single-Cell WNN + Cluster Analysis” pipelines. \ No newline at end of file diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 1339f5a0..422df642 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -789,9 +789,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell Manual Cell Type Assignment" -s:name: "Single-cell Manual Cell Type Assignment" -s:alternateName: "Assigns cell types for clusters based on the provided metadata file" +label: "Single-Cell Manual Cell Type Assignment" +s:name: "Single-Cell Manual Cell Type Assignment" +s:alternateName: "Assigns identities to clustered cells" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-ctype-assign.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -829,7 +829,17 @@ s:creator: doc: | - Single-cell Manual Cell Type Assignment - - Assigns cell types for clusters based on - the provided metadata file. \ No newline at end of file + Single-Cell Manual Cell Type Assignment + + Assigns identities to cells clustered with any of the “Single-Cell + Cluster Analysis” pipelines. 
For “Single-Cell RNA-Seq Cluster Analysis” + the results of this workflow are primarily used in “Single-Cell + RNA-Seq Differential Expression Analysis”, “Single-Cell RNA-Seq + Trajectory Analysis”, and, when combined with outputs from “Cell Ranger + Count (RNA+VDJ)” or “Cell Ranger Aggregate (RNA, RNA+VDJ)” workflows – + in “Single-Cell Immune Profiling Analysis” pipelines. For “Single-Cell + ATAC-Seq Cluster Analysis” the results of this workflow are primarily + used in “Single-Cell ATAC-Seq Differential Binding Analysis” and + “Single-Cell ATAC-Seq Genome Coverage” pipelines. For “Single-Cell WNN + Cluster Analysis” – in all of the above, except the “Single-Cell + Immune Profiling Analysis” workflow. \ No newline at end of file diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index c35a56d8..31cbf2f2 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -1445,9 +1445,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell Multiome ATAC and RNA-Seq Filtering Analysis" -s:name: "Single-cell Multiome ATAC and RNA-Seq Filtering Analysis" -s:alternateName: "Filters single-cell multiome ATAC and RNA-Seq datasets based on the multiple QC metrics" +label: "Single-Cell Multiome ATAC and RNA-Seq Filtering Analysis" +s:name: "Single-Cell Multiome ATAC and RNA-Seq Filtering Analysis" +s:alternateName: "Removes low-quality cells" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-multiome-filter.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -1485,7 +1485,10 @@ s:creator: doc: | - Single-cell Multiome ATAC and RNA-Seq Filtering Analysis - - Filters single-cell multiome ATAC and RNA-Seq datasets - based on the multiple QC metrics. 
\ No newline at end of file + Single-Cell Multiome ATAC and RNA-Seq Filtering Analysis + + Removes low-quality cells from the outputs of “Cell Ranger Count + (RNA+ATAC)” and “Cell Ranger Aggregate (RNA+ATAC)” pipelines. The + results of this workflow are primarily used in “Single-Cell RNA-Seq + Dimensionality Reduction Analysis” and “Single-Cell ATAC-Seq + Dimensionality Reduction Analysis” pipelines. \ No newline at end of file diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index efaef32d..b5a617bb 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -576,9 +576,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell RNA-Seq Cluster Analysis" -s:name: "Single-cell RNA-Seq Cluster Analysis" -s:alternateName: "Clusters single-cell RNA-Seq datasets, identifies gene markers" +label: "Single-Cell RNA-Seq Cluster Analysis" +s:name: "Single-Cell RNA-Seq Cluster Analysis" +s:alternateName: "Clusters cells by similarity of gene expression data" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-cluster.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -616,7 +616,9 @@ s:creator: doc: | - Single-cell RNA-Seq Cluster Analysis + Single-Cell RNA-Seq Cluster Analysis - Clusters single-cell RNA-Seq datasets, - identifies gene markers. \ No newline at end of file + Clusters cells by similarity of gene expression data from + the outputs of “Single-Cell RNA-Seq Dimensionality Reduction + Analysis” pipeline. The results of this workflow are primarily + used in “Single-Cell Manual Cell Type Assignment” pipeline. 
\ No newline at end of file diff --git a/workflows/sc-rna-da-cells.cwl b/workflows/sc-rna-da-cells.cwl index bb5662de..2bb2d5d0 100644 --- a/workflows/sc-rna-da-cells.cwl +++ b/workflows/sc-rna-da-cells.cwl @@ -436,9 +436,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell Differential Abundance Analysis" -s:name: "Single-cell Differential Abundance Analysis" -s:alternateName: "Detects cell subpopulations with differential abundance between datasets split by biological condition" +label: "Single-Cell Differential Abundance Analysis" +s:name: "Single-Cell Differential Abundance Analysis" +s:alternateName: "Compares the composition of cell types between two tested conditions" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-da-cells.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -476,7 +476,7 @@ s:creator: doc: | - Single-cell Differential Abundance Analysis + Single-Cell Differential Abundance Analysis - Detects cell subpopulations with differential abundance - between datasets split by biological condition. 
\ No newline at end of file + Compares the composition of cell types between + two tested conditions \ No newline at end of file diff --git a/workflows/sc-rna-de-pseudobulk.cwl b/workflows/sc-rna-de-pseudobulk.cwl index a51f1252..4a7e503e 100644 --- a/workflows/sc-rna-de-pseudobulk.cwl +++ b/workflows/sc-rna-de-pseudobulk.cwl @@ -744,9 +744,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell RNA-Seq Differential Expression Analysis" -s:name: "Single-cell RNA-Seq Differential Expression Analysis" -s:alternateName: "Identifies differentially expressed genes between groups of cells optionally coerced to the pseudobulk form" +label: "Single-Cell RNA-Seq Differential Expression Analysis" +s:name: "Single-Cell RNA-Seq Differential Expression Analysis" +s:alternateName: "Identifies differentially expressed genes between any two groups of cells" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-de-pseudobulk.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -784,8 +784,8 @@ s:creator: doc: | - Single-cell RNA-Seq Differential Expression Analysis + Single-Cell RNA-Seq Differential Expression Analysis - Identifies differentially expressed genes - between groups of cells optionally coerced - to the pseudobulk form. \ No newline at end of file + Identifies differentially expressed genes between any two + groups of cells, optionally aggregating gene expression + data from single-cell to pseudobulk form. 
\ No newline at end of file diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index 3d6ab961..933e3633 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -789,9 +789,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell RNA-Seq Filtering Analysis" -s:name: "Single-cell RNA-Seq Filtering Analysis" -s:alternateName: "Filters single-cell RNA-Seq datasets based on the common QC metrics" +label: "Single-Cell RNA-Seq Filtering Analysis" +s:name: "Single-Cell RNA-Seq Filtering Analysis" +s:alternateName: "Removes low-quality cells" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-filter.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -829,6 +829,9 @@ s:creator: doc: | - Single-cell RNA-Seq Filtering Analysis - - Filters single-cell RNA-Seq datasets based on the common QC metrics. \ No newline at end of file + Single-Cell RNA-Seq Filtering Analysis + + Removes low-quality cells from the outputs of “Cell Ranger Count (RNA)”, + “Cell Ranger Count (RNA+VDJ)”, and “Cell Ranger Aggregate (RNA, RNA+VDJ)” + pipelines. The results of this workflow are primarily used in “Single-Cell + RNA-Seq Dimensionality Reduction Analysis” pipeline. 
\ No newline at end of file diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index e079ed07..023a84be 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -703,9 +703,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell RNA-Seq Dimensionality Reduction Analysis" -s:name: "Single-cell RNA-Seq Dimensionality Reduction Analysis" -s:alternateName: "Integrates multiple single-cell RNA-Seq datasets, reduces dimensionality using PCA" +label: "Single-Cell RNA-Seq Dimensionality Reduction Analysis" +s:name: "Single-Cell RNA-Seq Dimensionality Reduction Analysis" +s:alternateName: "Removes noise and confounding sources of variation by reducing dimensionality of gene expression data" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-reduce.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -743,7 +743,11 @@ s:creator: doc: | - Single-cell RNA-Seq Dimensionality Reduction Analysis - - Integrates multiple single-cell RNA-Seq datasets, - reduces dimensionality using PCA. \ No newline at end of file + Single-Cell RNA-Seq Dimensionality Reduction Analysis + + Removes noise and confounding sources of variation by reducing + dimensionality of gene expression data from the outputs of + “Single-Cell RNA-Seq Filtering Analysis” or “Single-Cell Multiome + ATAC and RNA-Seq Filtering Analysis” pipelines. The results of + this workflow are primarily used in “Single-Cell RNA-Seq Cluster + Analysis” or “Single-Cell WNN Cluster Analysis” pipelines. 
\ No newline at end of file diff --git a/workflows/sc-rna-trajectory.cwl b/workflows/sc-rna-trajectory.cwl index 4cebe253..7db2d395 100644 --- a/workflows/sc-rna-trajectory.cwl +++ b/workflows/sc-rna-trajectory.cwl @@ -598,9 +598,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell RNA-Seq Trajectory Analysis" -s:name: "Single-cell RNA-Seq Trajectory Analysis" -s:alternateName: "Aligns cells along the trajectory defined based on PCA or other dimensionality reduction" +label: "Single-Cell RNA-Seq Trajectory Analysis" +s:name: "Single-Cell RNA-Seq Trajectory Analysis" +s:alternateName: "Infers developmental trajectories and pseudotime from cells clustered by similarity of gene expression data" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-trajectory.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -638,7 +638,7 @@ s:creator: doc: | - Single-cell RNA-Seq Trajectory Analysis + Single-Cell RNA-Seq Trajectory Analysis - Aligns cells along the trajectory defined based - on PCA or other dimensionality reduction \ No newline at end of file + Infers developmental trajectories and pseudotime from + cells clustered by similarity of gene expression data. 
\ No newline at end of file diff --git a/workflows/sc-vdj-profile.cwl b/workflows/sc-vdj-profile.cwl index 84e96dd2..2b241086 100644 --- a/workflows/sc-vdj-profile.cwl +++ b/workflows/sc-vdj-profile.cwl @@ -631,9 +631,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell Immune Profiling Analysis" -s:name: "Single-cell Immune Profiling Analysis" -s:alternateName: "TCR/BCR clonotype dynamics analysis" +label: "Single-Cell Immune Profiling Analysis" +s:name: "Single-Cell Immune Profiling Analysis" +s:alternateName: "Estimates clonotype diversity and dynamics from V(D)J sequencing data assembled into contigs" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-vdj-profile.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -671,6 +671,7 @@ s:creator: doc: | - Single-cell Immune Profiling Analysis + Single-Cell Immune Profiling Analysis - TCR/BCR clonotype dynamics analysis \ No newline at end of file + Estimates clonotype diversity and dynamics from V(D)J + sequencing data assembled into contigs \ No newline at end of file diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index 5d0b3a9d..06d53bff 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -647,9 +647,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell WNN Cluster Analysis" -s:name: "Single-cell WNN Cluster Analysis" -s:alternateName: "Clusters multiome ATAC and RNA-Seq datasets, identifies gene markers and differentially accessible peaks" +label: "Single-Cell WNN Cluster Analysis" +s:name: "Single-Cell WNN Cluster Analysis" +s:alternateName: "Clusters cells by similarity based on both gene expression and chromatin accessibility data" s:downloadUrl: 
https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-wnn-cluster.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -687,8 +687,11 @@ s:creator: doc: | - Single-cell WNN Cluster Analysis - - Clusters multiome ATAC and RNA-Seq datasets, - identifies gene markers and differentially - accessible peaks. \ No newline at end of file + Single-Cell WNN Cluster Analysis + + Clusters cells by similarity based on both gene expression and + chromatin accessibility data from the outputs of “Single-Cell + RNA-Seq Dimensionality Reduction Analysis” and “Single-Cell + ATAC-Seq Dimensionality Reduction Analysis” pipelines run + sequentially. The results of this workflow are primarily used + in “Single-Cell Manual Cell Type Assignment” pipeline. \ No newline at end of file diff --git a/workflows/single-cell-preprocess-cellranger.cwl b/workflows/single-cell-preprocess-cellranger.cwl index a81326ee..5ba70504 100644 --- a/workflows/single-cell-preprocess-cellranger.cwl +++ b/workflows/single-cell-preprocess-cellranger.cwl @@ -400,9 +400,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -s:name: "Cell Ranger Count Gene Expression" -label: "Cell Ranger Count Gene Expression" -s:alternateName: "Counts gene expression for a single library" +s:name: "Cell Ranger Count (RNA)" +label: "Cell Ranger Count (RNA)" +s:alternateName: "Quantifies single-cell gene expression of the sequencing data from a single 10x Genomics library" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/single-cell-preprocess-cellranger.cwl s:codeRepository: https://github.com/datirium/workflows @@ -440,6 +440,9 @@ s:creator: doc: | - Cell Ranger Count Gene Expression - - Quantifies gene expression from a single-cell RNA-Seq library. 
\ No newline at end of file + Cell Ranger Count (RNA) + + Quantifies single-cell gene expression of the sequencing data + from a single 10x Genomics library. The results of this workflow + are primarily used in either “Single-Cell RNA-Seq Filtering + Analysis” or “Cell Ranger Aggregate (RNA, RNA+VDJ)” pipelines. \ No newline at end of file From fe8b2b0ad469612027b53b7f156761c831c77a8c Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 18 Jan 2024 18:58:38 -0500 Subject: [PATCH 104/162] Not important changes --- workflows/sc-atac-cluster.cwl | 2 +- workflows/sc-ctype-assign.cwl | 6 +++--- workflows/sc-rna-cluster.cwl | 2 +- workflows/sc-rna-da-cells.cwl | 2 +- workflows/sc-rna-trajectory.cwl | 6 +++--- workflows/sc-vdj-profile.cwl | 4 ++-- workflows/sc-wnn-cluster.cwl | 4 ++-- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index aeb64e0b..8d4a0025 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -47,7 +47,7 @@ inputs: Analysis that includes single-cell multiome RNA and ATAC-Seq or just ATAC-Seq datasets run through - "Single-cell ATAC-Seq Dimensionality + "Single-Cell ATAC-Seq Dimensionality Reduction Analysis" at any of the processing stages. 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 422df642..c9be7fc8 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -51,9 +51,9 @@ inputs: Analysis that includes clustered single-cell data and was run through at least one of the following workflows: - "Single-cell RNA-Seq Cluster Analysis", - "Single-cell ATAC-Seq Cluster Analysis", - "Single-cell WNN Cluster Analysis", - + "Single-Cell RNA-Seq Cluster Analysis", + "Single-Cell ATAC-Seq Cluster Analysis", + "Single-Cell WNN Cluster Analysis", - at any of the processing stages. 
'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" 'sd:localLabel': true diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index b5a617bb..b13806e0 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -44,7 +44,7 @@ inputs: Analysis that includes single-cell multiome RNA and ATAC-Seq or just RNA-Seq datasets run through - "Single-cell RNA-Seq Dimensionality + "Single-Cell RNA-Seq Dimensionality Reduction Analysis" at any of the processing stages. 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" diff --git a/workflows/sc-rna-da-cells.cwl b/workflows/sc-rna-da-cells.cwl index 2bb2d5d0..9bb5a26a 100644 --- a/workflows/sc-rna-da-cells.cwl +++ b/workflows/sc-rna-da-cells.cwl @@ -35,7 +35,7 @@ inputs: query_data_rds: type: File - label: "Experiment run through Single-cell RNA-Seq Dimensionality Reduction Analysis" + label: "Experiment run through Single-Cell RNA-Seq Dimensionality Reduction Analysis" doc: | Path to the RDS file to load Seurat object from. This file should include genes expression information stored in the RNA assay and selected with the --reduction diff --git a/workflows/sc-rna-trajectory.cwl b/workflows/sc-rna-trajectory.cwl index 7db2d395..50f96b25 100644 --- a/workflows/sc-rna-trajectory.cwl +++ b/workflows/sc-rna-trajectory.cwl @@ -40,11 +40,11 @@ inputs: Analysis that includes single-cell multiome RNA and ATAC-Seq or just RNA-Seq datasets run through either - "Single-cell Manual Cell Type + "Single-Cell Manual Cell Type Assignment" (based on the RNA or WNN - clustering results), "Single-cell + clustering results), "Single-Cell RNA-Seq Cluster Analysis", or - "Single-cell WNN Cluster Analysis" + "Single-Cell WNN Cluster Analysis" at any of the processing stages. 
'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" 'sd:localLabel': true diff --git a/workflows/sc-vdj-profile.cwl b/workflows/sc-vdj-profile.cwl index 2b241086..56cc802e 100644 --- a/workflows/sc-vdj-profile.cwl +++ b/workflows/sc-vdj-profile.cwl @@ -32,8 +32,8 @@ inputs: doc: | Analysis that includes single-cell RNA-Seq datasets run through either - "Single-cell Manual Cell Type - Assignment" or "Single-cell RNA-Seq + "Single-Cell Manual Cell Type + Assignment" or "Single-Cell RNA-Seq Cluster Analysis" at any of the processing stages. 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index 06d53bff..0355033c 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -47,9 +47,9 @@ inputs: doc: | Analysis that includes single-cell multiome RNA and ATAC-Seq datasets - run through both "Single-cell + run through both "Single-Cell RNA-Seq Dimensionality Reduction - Analysis" and "Single-cell ATAC-Seq + Analysis" and "Single-Cell ATAC-Seq Dimensionality Reduction Analysis" at any of the processing stages. 
'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" From fb46905d62dee91f230afe46f2ac59d857b9d069 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 19 Jan 2024 15:01:01 -0500 Subject: [PATCH 105/162] Update all "transcripts" to "RNA reads" and all "Fragments" to "ATAC fragments" --- tools/sc-atac-cluster.cwl | 6 +- tools/sc-atac-coverage.cwl | 8 +- tools/sc-atac-dbinding.cwl | 6 +- tools/sc-atac-reduce.cwl | 10 +- tools/sc-ctype-assign.cwl | 6 +- tools/sc-multiome-filter.cwl | 154 ++++++++++++++-------------- tools/sc-rna-cluster.cwl | 2 +- tools/sc-rna-da-cells.cwl | 2 +- tools/sc-rna-de-pseudobulk.cwl | 2 +- tools/sc-rna-filter.cwl | 52 +++++----- tools/sc-rna-reduce.cwl | 14 +-- tools/sc-rna-trajectory.cwl | 2 +- tools/sc-triangulate.cwl | 2 +- tools/sc-vdj-profile.cwl | 2 +- tools/sc-wnn-cluster.cwl | 6 +- workflows/cellranger-atac-count.cwl | 4 +- 16 files changed, 139 insertions(+), 139 deletions(-) diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index 8b8d3c5e..17ad942d 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.32 + dockerPull: biowardrobe2/sc-tools:v0.0.33 inputs: @@ -469,7 +469,7 @@ outputs: outputBinding: glob: "*_cvrg_res_*.png" doc: | - Fragments coverage. + ATAC fragments coverage. PNG format cvrg_res_plot_pdf: @@ -480,7 +480,7 @@ outputs: outputBinding: glob: "*_cvrg_res_*.pdf" doc: | - Fragments coverage. + ATAC fragments coverage. 
PDF format peak_markers_tsv: diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index 0c91c2e4..79d61bda 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.32 + dockerPull: biowardrobe2/sc-tools:v0.0.33 inputs: @@ -150,7 +150,7 @@ outputs: outputBinding: glob: "*_frg_cov.bigWig" doc: | - Genome coverage calculated for fragments + Genome coverage calculated for ATAC fragments in bigWig format stdout_log: @@ -175,7 +175,7 @@ $schemas: label: "Single-cell ATAC-Seq Genome Coverage" s:name: "Single-cell ATAC-Seq Genome Coverage" -s:alternateName: "Creates genome coverage bigWig files from the provided fragments file and selected grouping parameters" +s:alternateName: "Creates genome coverage bigWig files from the provided ATAC fragments file and selected grouping parameters" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-atac-coverage.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -215,7 +215,7 @@ s:creator: doc: | Single-cell ATAC-Seq Genome Coverage - Creates genome coverage bigWig files from the provided fragments file + Creates genome coverage bigWig files from the provided ATAC fragments file and selected grouping parameters. --tmpdir parameter is not exposed as input. diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index 30ee25d6..d848a15a 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.32 + dockerPull: biowardrobe2/sc-tools:v0.0.33 inputs: @@ -369,7 +369,7 @@ outputs: glob: "*_first.bigWig" doc: | Genome coverage in bigWig format calculated - for fragments from the cells that belong to + for ATAC fragments from the cells that belong to the group defined by the --first and --groupby parameters. 
@@ -379,7 +379,7 @@ outputs: glob: "*_second.bigWig" doc: | Genome coverage in bigWig format calculated - for fragments from the cells that belong to + for ATAC fragments from the cells that belong to the group defined by the --second and --groupby parameters. diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index 2098bd11..f8f7bc81 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.32 + dockerPull: biowardrobe2/sc-tools:v0.0.33 inputs: @@ -384,7 +384,7 @@ outputs: glob: "*_umap_spl_frgm.png" doc: | UMAP, colored by dataset, split - by fragments in peaks per cell. + by ATAC fragments in peaks per cell. PNG format umap_spl_frgm_plot_pdf: @@ -393,7 +393,7 @@ outputs: glob: "*_umap_spl_frgm.pdf" doc: | UMAP, colored by dataset, split - by fragments in peaks per cell. + by ATAC fragments in peaks per cell. PDF format umap_spl_peak_plot_png: @@ -492,7 +492,7 @@ outputs: glob: "*_umap_gr_cnd_spl_frgm.png" doc: | UMAP, colored by grouping condition, - split by fragments in peaks per cell. + split by ATAC fragments in peaks per cell. PNG format umap_gr_cnd_spl_frgm_plot_pdf: @@ -501,7 +501,7 @@ outputs: glob: "*_umap_gr_cnd_spl_frgm.pdf" doc: | UMAP, colored by grouping condition, - split by fragments in peaks per cell. + split by ATAC fragments in peaks per cell. PDF format umap_gr_cnd_spl_peak_plot_png: diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index 03cf11bc..11f697de 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.32 + dockerPull: biowardrobe2/sc-tools:v0.0.33 inputs: @@ -829,7 +829,7 @@ outputs: outputBinding: glob: "*_cvrg_*.png" doc: | - Fragments coverage. + ATAC fragments coverage. 
PNG format cvrg_plot_pdf: @@ -840,7 +840,7 @@ outputs: outputBinding: glob: "*_cvrg_*.pdf" doc: | - Fragments coverage. + ATAC fragments coverage. PDF format xpr_htmp_plot_png: diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index 0954ae5f..c1fdcea7 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.32 + dockerPull: biowardrobe2/sc-tools:v0.0.33 inputs: @@ -158,7 +158,7 @@ inputs: inputBinding: prefix: "--minumis" doc: | - Include cells where at least this many UMI (transcripts) are detected. + Include cells where at least this many RNA reads are detected. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. Default: 500 (applied to all datasets) @@ -176,7 +176,7 @@ inputs: inputBinding: prefix: "--maxmt" doc: | - Include cells with the percentage of transcripts mapped to mitochondrial + Include cells with the percentage of RNA reads mapped to mitochondrial genes not bigger than this value. Default: 5 (applied to all datasets) @@ -210,7 +210,7 @@ inputs: inputBinding: prefix: "--minfragments" doc: | - Include cells where at least this many fragments in peaks are + Include cells where at least this many ATAC fragments in peaks are detected. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. @@ -226,7 +226,7 @@ inputs: doc: | Include cells with the nucleosome signal not bigger than this value. Nucleosome signal quantifies the approximate ratio of mononucleosomal - to nucleosome-free fragments. If multiple values provided, each of + to nucleosome-free ATAC fragments. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. 
Default: 4 (applied to all datasets) @@ -240,8 +240,8 @@ inputs: prefix: "--mintssenrich" doc: | Include cells with the TSS enrichment score not lower than this value. - Score is calculated based on the ratio of fragments centered at the TSS - to fragments in TSS-flanking regions. If multiple values provided, each + Score is calculated based on the ratio of ATAC fragments centered at the TSS + to ATAC fragments in TSS-flanking regions. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. Default: 2 (applied to all datasets) @@ -256,7 +256,7 @@ inputs: doc: | Include cells with the FRiP not lower than this value. If multiple values provided, each of them will be applied to the correspondent dataset from the - '--mex' input based on the '--identity' file. FRiP is calculated for fragments. + '--mex' input based on the '--identity' file. FRiP is calculated for ATAC fragments. Default: 0.15 (applied to all datasets) maximum_blacklist_fraction: @@ -267,7 +267,7 @@ inputs: inputBinding: prefix: "--maxblacklist" doc: | - Include cells with the fraction of fragments in + Include cells with the fraction of ATAC fragments in genomic blacklist regions not bigger than this value. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' @@ -498,7 +498,7 @@ outputs: outputBinding: glob: "*_raw_umi_dnst.png" doc: | - Transcripts per cell density (not filtered). + RNA reads per cell density (not filtered). PNG format raw_umi_dnst_plot_pdf: @@ -506,7 +506,7 @@ outputs: outputBinding: glob: "*_raw_umi_dnst.pdf" doc: | - Transcripts per cell density (not filtered). + RNA reads per cell density (not filtered). PDF format raw_gene_dnst_plot_png: @@ -530,7 +530,7 @@ outputs: outputBinding: glob: "*_raw_gene_umi.png" doc: | - Genes vs transcripts per cell (not filtered). + Genes vs RNA reads per cell (not filtered). 
PNG format raw_gene_umi_plot_pdf: @@ -538,7 +538,7 @@ outputs: outputBinding: glob: "*_raw_gene_umi.pdf" doc: | - Genes vs transcripts per cell (not filtered). + Genes vs RNA reads per cell (not filtered). PDF format raw_mito_dnst_plot_png: @@ -546,7 +546,7 @@ outputs: outputBinding: glob: "*_raw_mito_dnst.png" doc: | - Percentage of transcripts mapped to mitochondrial genes per cell density (not filtered). + Percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered). PNG format raw_mito_dnst_plot_pdf: @@ -554,7 +554,7 @@ outputs: outputBinding: glob: "*_raw_mito_dnst.pdf" doc: | - Percentage of transcripts mapped to mitochondrial genes per cell density (not filtered). + Percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered). PDF format raw_nvlt_dnst_plot_png: @@ -578,7 +578,7 @@ outputs: outputBinding: glob: "*_raw_frgm_dnst.png" doc: | - Fragments in peaks per cell density (not filtered). + ATAC fragments in peaks per cell density (not filtered). PNG format raw_frgm_dnst_plot_pdf: @@ -586,7 +586,7 @@ outputs: outputBinding: glob: "*_raw_frgm_dnst.pdf" doc: | - Fragments in peaks per cell density (not filtered). + ATAC fragments in peaks per cell density (not filtered). PDF format raw_peak_dnst_plot_png: @@ -626,7 +626,7 @@ outputs: outputBinding: glob: "*_raw_rna_atac_cnts.png" doc: | - Transcripts vs fragments in peaks per cell (not filtered). + RNA reads vs ATAC fragments in peaks per cell (not filtered). PNG format raw_rna_atac_cnts_plot_pdf: @@ -634,7 +634,7 @@ outputs: outputBinding: glob: "*_raw_rna_atac_cnts.pdf" doc: | - Transcripts vs fragments in peaks per cell (not filtered). + RNA reads vs ATAC fragments in peaks per cell (not filtered). PDF format raw_tss_frgm_plot_png: @@ -642,7 +642,7 @@ outputs: outputBinding: glob: "*_raw_tss_frgm.png" doc: | - TSS enrichment score vs fragments in peaks per cell (not filtered). + TSS enrichment score vs ATAC fragments in peaks per cell (not filtered). 
PNG format raw_tss_frgm_plot_pdf: @@ -650,7 +650,7 @@ outputs: outputBinding: glob: "*_raw_tss_frgm.pdf" doc: | - TSS enrichment score vs fragments in peaks per cell (not filtered). + TSS enrichment score vs ATAC fragments in peaks per cell (not filtered). PDF format raw_qc_mtrcs_dnst_plot_png: @@ -738,7 +738,7 @@ outputs: outputBinding: glob: "*_raw_frgm_hist.png" doc: | - Fragments length histogram (not filtered). + ATAC fragments length histogram (not filtered). PNG format raw_frgm_hist_pdf: @@ -746,7 +746,7 @@ outputs: outputBinding: glob: "*_raw_frgm_hist.pdf" doc: | - Fragments length histogram (not filtered). + ATAC fragments length histogram (not filtered). PDF format raw_umi_dnst_spl_cnd_plot_png: @@ -754,7 +754,7 @@ outputs: outputBinding: glob: "*_raw_umi_dnst_spl_cnd.png" doc: | - Split by grouping condition transcripts per cell density (not filtered). + Split by grouping condition RNA reads per cell density (not filtered). PNG format raw_umi_dnst_spl_cnd_plot_pdf: @@ -762,7 +762,7 @@ outputs: outputBinding: glob: "*_raw_umi_dnst_spl_cnd.pdf" doc: | - Split by grouping condition transcripts per cell density (not filtered). + Split by grouping condition RNA reads per cell density (not filtered). PDF format raw_gene_dnst_spl_cnd_plot_png: @@ -786,7 +786,7 @@ outputs: outputBinding: glob: "*_raw_mito_dnst_spl_cnd.png" doc: | - Split by grouping condition the percentage of transcripts mapped + Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered). PNG format @@ -795,7 +795,7 @@ outputs: outputBinding: glob: "*_raw_mito_dnst_spl_cnd.pdf" doc: | - Split by grouping condition the percentage of transcripts mapped + Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered). 
PDF format @@ -820,7 +820,7 @@ outputs: outputBinding: glob: "*_raw_frgm_dnst_spl_cnd.png" doc: | - Split by grouping condition fragments in peaks per cell density (not filtered). + Split by grouping condition ATAC fragments in peaks per cell density (not filtered). PNG format raw_frgm_dnst_spl_cnd_plot_pdf: @@ -828,7 +828,7 @@ outputs: outputBinding: glob: "*_raw_frgm_dnst_spl_cnd.pdf" doc: | - Split by grouping condition fragments in peaks per cell density (not filtered). + Split by grouping condition ATAC fragments in peaks per cell density (not filtered). PDF format raw_peak_dnst_spl_cnd_plot_png: @@ -918,7 +918,7 @@ outputs: outputBinding: glob: "*_mid_fltr_umi_dnst.png" doc: | - Transcripts per cell density (intermediate filtered). + RNA reads per cell density (intermediate filtered). PNG format mid_fltr_umi_dnst_plot_pdf: @@ -926,7 +926,7 @@ outputs: outputBinding: glob: "*_mid_fltr_umi_dnst.pdf" doc: | - Transcripts per cell density (intermediate filtered). + RNA reads per cell density (intermediate filtered). PDF format mid_fltr_gene_dnst_plot_png: @@ -950,7 +950,7 @@ outputs: outputBinding: glob: "*_mid_fltr_gene_umi.png" doc: | - Genes vs transcripts per cell (intermediate filtered). + Genes vs RNA reads per cell (intermediate filtered). PNG format mid_fltr_gene_umi_plot_pdf: @@ -958,7 +958,7 @@ outputs: outputBinding: glob: "*_mid_fltr_gene_umi.pdf" doc: | - Genes vs transcripts per cell (intermediate filtered). + Genes vs RNA reads per cell (intermediate filtered). PDF format mid_fltr_mito_dnst_plot_png: @@ -966,7 +966,7 @@ outputs: outputBinding: glob: "*_mid_fltr_mito_dnst.png" doc: | - Percentage of transcripts mapped to mitochondrial genes per cell density (intermediate filtered). + Percentage of RNA reads mapped to mitochondrial genes per cell density (intermediate filtered). 
PNG format mid_fltr_mito_dnst_plot_pdf: @@ -974,7 +974,7 @@ outputs: outputBinding: glob: "*_mid_fltr_mito_dnst.pdf" doc: | - Percentage of transcripts mapped to mitochondrial genes per cell density (intermediate filtered). + Percentage of RNA reads mapped to mitochondrial genes per cell density (intermediate filtered). PDF format mid_fltr_nvlt_dnst_plot_png: @@ -998,7 +998,7 @@ outputs: outputBinding: glob: "*_mid_fltr_frgm_dnst.png" doc: | - Fragments in peaks per cell density (intermediate filtered). + ATAC fragments in peaks per cell density (intermediate filtered). PNG format mid_fltr_frgm_dnst_plot_pdf: @@ -1006,7 +1006,7 @@ outputs: outputBinding: glob: "*_mid_fltr_frgm_dnst.pdf" doc: | - Fragments in peaks per cell density (intermediate filtered). + ATAC fragments in peaks per cell density (intermediate filtered). PDF format mid_fltr_peak_dnst_plot_png: @@ -1046,7 +1046,7 @@ outputs: outputBinding: glob: "*_mid_fltr_rna_atac_cnts.png" doc: | - Transcripts vs fragments in peaks per cell (intermediate filtered). + RNA reads vs ATAC fragments in peaks per cell (intermediate filtered). PNG format mid_fltr_rna_atac_cnts_plot_pdf: @@ -1054,7 +1054,7 @@ outputs: outputBinding: glob: "*_mid_fltr_rna_atac_cnts.pdf" doc: | - Transcripts vs fragments in peaks per cell (intermediate filtered). + RNA reads vs ATAC fragments in peaks per cell (intermediate filtered). PDF format mid_fltr_tss_frgm_plot_png: @@ -1062,7 +1062,7 @@ outputs: outputBinding: glob: "*_mid_fltr_tss_frgm.png" doc: | - TSS enrichment score vs fragments in peaks per cell (intermediate filtered). + TSS enrichment score vs ATAC fragments in peaks per cell (intermediate filtered). PNG format mid_fltr_tss_frgm_plot_pdf: @@ -1070,7 +1070,7 @@ outputs: outputBinding: glob: "*_mid_fltr_tss_frgm.pdf" doc: | - TSS enrichment score vs fragments in peaks per cell (intermediate filtered). + TSS enrichment score vs ATAC fragments in peaks per cell (intermediate filtered). 
PDF format mid_fltr_qc_mtrcs_dnst_plot_png: @@ -1158,7 +1158,7 @@ outputs: outputBinding: glob: "*_mid_fltr_frgm_hist.png" doc: | - Fragments length histogram (intermediate filtered). + ATAC fragments length histogram (intermediate filtered). PNG format mid_fltr_frgm_hist_pdf: @@ -1166,7 +1166,7 @@ outputs: outputBinding: glob: "*_mid_fltr_frgm_hist.pdf" doc: | - Fragments length histogram (intermediate filtered). + ATAC fragments length histogram (intermediate filtered). PDF format mid_fltr_umi_dnst_spl_cnd_plot_png: @@ -1174,7 +1174,7 @@ outputs: outputBinding: glob: "*_mid_fltr_umi_dnst_spl_cnd.png" doc: | - Split by grouping condition transcripts per cell density (intermediate filtered). + Split by grouping condition RNA reads per cell density (intermediate filtered). PNG format mid_fltr_umi_dnst_spl_cnd_plot_pdf: @@ -1182,7 +1182,7 @@ outputs: outputBinding: glob: "*_mid_fltr_umi_dnst_spl_cnd.pdf" doc: | - Split by grouping condition transcripts per cell density (intermediate filtered). + Split by grouping condition RNA reads per cell density (intermediate filtered). PDF format mid_fltr_gene_dnst_spl_cnd_plot_png: @@ -1206,7 +1206,7 @@ outputs: outputBinding: glob: "*_mid_fltr_mito_dnst_spl_cnd.png" doc: | - Split by grouping condition the percentage of transcripts mapped + Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density (intermediate filtered). PNG format @@ -1215,7 +1215,7 @@ outputs: outputBinding: glob: "*_mid_fltr_mito_dnst_spl_cnd.pdf" doc: | - Split by grouping condition the percentage of transcripts mapped + Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density (intermediate filtered). PDF format @@ -1240,7 +1240,7 @@ outputs: outputBinding: glob: "*_mid_fltr_frgm_dnst_spl_cnd.png" doc: | - Split by grouping condition fragments in peaks per cell density (intermediate filtered). 
+ Split by grouping condition ATAC fragments in peaks per cell density (intermediate filtered). PNG format mid_fltr_frgm_dnst_spl_cnd_plot_pdf: @@ -1248,7 +1248,7 @@ outputs: outputBinding: glob: "*_mid_fltr_frgm_dnst_spl_cnd.pdf" doc: | - Split by grouping condition fragments in peaks per cell density (intermediate filtered). + Split by grouping condition ATAC fragments in peaks per cell density (intermediate filtered). PDF format mid_fltr_peak_dnst_spl_cnd_plot_png: @@ -1338,7 +1338,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_umi_dnst.png" doc: | - Transcripts per cell density (filtered). + RNA reads per cell density (filtered). PNG format fltr_umi_dnst_plot_pdf: @@ -1346,7 +1346,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_umi_dnst.pdf" doc: | - Transcripts per cell density (filtered). + RNA reads per cell density (filtered). PDF format fltr_gene_dnst_plot_png: @@ -1370,7 +1370,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_gene_umi.png" doc: | - Genes vs transcripts per cell (filtered). + Genes vs RNA reads per cell (filtered). PNG format fltr_gene_umi_plot_pdf: @@ -1378,7 +1378,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_gene_umi.pdf" doc: | - Genes vs transcripts per cell (filtered). + Genes vs RNA reads per cell (filtered). PDF format fltr_mito_dnst_plot_png: @@ -1386,7 +1386,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_mito_dnst.png" doc: | - Percentage of transcripts mapped to mitochondrial genes per cell density (filtered). + Percentage of RNA reads mapped to mitochondrial genes per cell density (filtered). PNG format fltr_mito_dnst_plot_pdf: @@ -1394,7 +1394,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_mito_dnst.pdf" doc: | - Percentage of transcripts mapped to mitochondrial genes per cell density (filtered). + Percentage of RNA reads mapped to mitochondrial genes per cell density (filtered). 
PDF format fltr_nvlt_dnst_plot_png: @@ -1418,7 +1418,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_frgm_dnst.png" doc: | - Fragments in peaks per cell density (filtered). + ATAC fragments in peaks per cell density (filtered). PNG format fltr_frgm_dnst_plot_pdf: @@ -1426,7 +1426,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_frgm_dnst.pdf" doc: | - Fragments in peaks per cell density (filtered). + ATAC fragments in peaks per cell density (filtered). PDF format fltr_peak_dnst_plot_png: @@ -1466,7 +1466,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_rna_atac_cnts.png" doc: | - Transcripts vs fragments in peaks per cell (filtered). + RNA reads vs ATAC fragments in peaks per cell (filtered). PNG format fltr_rna_atac_cnts_plot_pdf: @@ -1474,7 +1474,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_rna_atac_cnts.pdf" doc: | - Transcripts vs fragments in peaks per cell (filtered). + RNA reads vs ATAC fragments in peaks per cell (filtered). PDF format fltr_rnadbl_plot_png: @@ -1530,7 +1530,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_tss_frgm.png" doc: | - TSS enrichment score vs fragments in peaks per cell (filtered). + TSS enrichment score vs ATAC fragments in peaks per cell (filtered). PNG format fltr_tss_frgm_plot_pdf: @@ -1538,7 +1538,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_tss_frgm.pdf" doc: | - TSS enrichment score vs fragments in peaks per cell (filtered). + TSS enrichment score vs ATAC fragments in peaks per cell (filtered). PDF format fltr_qc_mtrcs_dnst_plot_png: @@ -1578,7 +1578,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_frgm_hist.png" doc: | - Fragments length histogram (filtered). + ATAC fragments length histogram (filtered). PNG format fltr_frgm_hist_pdf: @@ -1586,7 +1586,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_frgm_hist.pdf" doc: | - Fragments length histogram (filtered). + ATAC fragments length histogram (filtered). 
PDF format fltr_umi_dnst_spl_cnd_plot_png: @@ -1594,7 +1594,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_umi_dnst_spl_cnd.png" doc: | - Split by grouping condition transcripts per cell density (filtered). + Split by grouping condition RNA reads per cell density (filtered). PNG format fltr_umi_dnst_spl_cnd_plot_pdf: @@ -1602,7 +1602,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_umi_dnst_spl_cnd.pdf" doc: | - Split by grouping condition transcripts per cell density (filtered). + Split by grouping condition RNA reads per cell density (filtered). PDF format fltr_gene_dnst_spl_cnd_plot_png: @@ -1626,7 +1626,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_mito_dnst_spl_cnd.png" doc: | - Split by grouping condition the percentage of transcripts mapped + Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density (filtered). PNG format @@ -1635,7 +1635,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_mito_dnst_spl_cnd.pdf" doc: | - Split by grouping condition the percentage of transcripts mapped + Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density (filtered). PDF format @@ -1660,7 +1660,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_frgm_dnst_spl_cnd.png" doc: | - Split by grouping condition fragments in peaks per cell density (filtered). + Split by grouping condition ATAC fragments in peaks per cell density (filtered). PNG format fltr_frgm_dnst_spl_cnd_plot_pdf: @@ -1668,7 +1668,7 @@ outputs: outputBinding: glob: "*[!_mid]_fltr_frgm_dnst_spl_cnd.pdf" doc: | - Split by grouping condition fragments in peaks per cell density (filtered). + Split by grouping condition ATAC fragments in peaks per cell density (filtered). PDF format fltr_peak_dnst_spl_cnd_plot_png: @@ -1918,8 +1918,8 @@ s:about: | '--mex' input based on the '--identity' file. 
Default: 5000 (applied to all datasets) --minumis [MINUMIS [MINUMIS ...]] - Include cells where at least this many UMI - (transcripts) are detected. If multiple values + Include cells where at least this many RNA reads + are detected. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. Default: 500 (applied to all @@ -1927,7 +1927,7 @@ s:about: | --mitopattern MITOPATTERN Regex pattern to identify mitochondrial genes. Default: '^mt-|^MT-' - --maxmt MAXMT Include cells with the percentage of transcripts + --maxmt MAXMT Include cells with the percentage of RNA reads mapped to mitochondrial genes not bigger than this value. Default: 5 (applied to all datasets) --minnovelty [MINNOVELTY [MINNOVELTY ...]] @@ -1941,7 +1941,7 @@ s:about: | Include only peaks detected in at least this many cells. Default: 5 (applied to all datasets) --minfragments [MINFRAGMENTS [MINFRAGMENTS ...]] - Include cells where at least this many fragments in + Include cells where at least this many ATAC fragments in peaks are detected. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. @@ -1950,14 +1950,14 @@ s:about: | Include cells with the nucleosome signal not bigger than this value. Nucleosome signal quantifies the approximate ratio of mononucleosomal to nucleosome- - free fragments. If multiple values provided, each of + free ATAC fragments. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. Default: 4 (applied to all datasets) --mintssenrich [MINTSSENRICH [MINTSSENRICH ...]] Include cells with the TSS enrichment score not lower than this value. Score is calculated based on the - ratio of fragments centered at the TSS to fragments in + ratio of ATAC fragments centered at the TSS to ATAC fragments in TSS-flanking regions. 
If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '-- @@ -1967,10 +1967,10 @@ s:about: | If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. FRiP is - calculated for fragments. Default: 0.15 (applied to + calculated for ATAC fragments. Default: 0.15 (applied to all datasets) --maxblacklist [MAXBLACKLIST [MAXBLACKLIST ...]] - Include cells with the fraction of fragments in + Include cells with the fraction of ATAC fragments in genomic blacklist regions not bigger than this value. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index f17f312b..05ea0a20 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.32 + dockerPull: biowardrobe2/sc-tools:v0.0.33 inputs: diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index 4e4c379d..e204cf2a 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.32 + dockerPull: biowardrobe2/sc-tools:v0.0.33 inputs: diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index 997996d6..58a9d7ed 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.32 + dockerPull: biowardrobe2/sc-tools:v0.0.33 inputs: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index 1d883441..e7c68712 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: 
biowardrobe2/sc-tools:v0.0.32 + dockerPull: biowardrobe2/sc-tools:v0.0.33 inputs: @@ -103,7 +103,7 @@ inputs: inputBinding: prefix: "--minumis" doc: | - Include cells where at least this many UMI (transcripts) are detected. + Include cells where at least this many RNA reads are detected. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. Default: 500 (applied to all datasets) @@ -135,7 +135,7 @@ inputs: inputBinding: prefix: "--maxmt" doc: | - Include cells with the percentage of transcripts mapped to mitochondrial + Include cells with the percentage of RNA reads mapped to mitochondrial genes not bigger than this value. Default: 5 (applied to all datasets) @@ -314,7 +314,7 @@ outputs: outputBinding: glob: "*_raw_umi_dnst.png" doc: | - Transcripts per cell density (not filtered). + RNA reads per cell density (not filtered). PNG format raw_umi_dnst_plot_pdf: @@ -322,7 +322,7 @@ outputs: outputBinding: glob: "*_raw_umi_dnst.pdf" doc: | - Transcripts per cell density (not filtered). + RNA reads per cell density (not filtered). PDF format raw_gene_dnst_plot_png: @@ -346,7 +346,7 @@ outputs: outputBinding: glob: "*_raw_gene_umi.png" doc: | - Genes vs transcripts per cell correlation (not filtered). + Genes vs RNA reads per cell correlation (not filtered). PNG format raw_gene_umi_plot_pdf: @@ -354,7 +354,7 @@ outputs: outputBinding: glob: "*_raw_gene_umi.pdf" doc: | - Genes vs transcripts per cell correlation (not filtered). + Genes vs RNA reads per cell correlation (not filtered). PDF format raw_mito_dnst_plot_png: @@ -362,7 +362,7 @@ outputs: outputBinding: glob: "*_raw_mito_dnst.png" doc: | - Percentage of transcripts mapped to mitochondrial genes per cell density (not filtered). + Percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered). 
PNG format raw_mito_dnst_plot_pdf: @@ -370,7 +370,7 @@ outputs: outputBinding: glob: "*_raw_mito_dnst.pdf" doc: | - Percentage of transcripts mapped to mitochondrial genes per cell density (not filtered). + Percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered). PDF format raw_nvlt_dnst_plot_png: @@ -426,7 +426,7 @@ outputs: outputBinding: glob: "*_raw_umi_dnst_spl_cnd.png" doc: | - Split by grouping condition transcripts per cell density (not filtered). + Split by grouping condition RNA reads per cell density (not filtered). PNG format raw_umi_dnst_spl_cnd_plot_pdf: @@ -434,7 +434,7 @@ outputs: outputBinding: glob: "*_raw_umi_dnst_spl_cnd.pdf" doc: | - Split by grouping condition transcripts per cell density (not filtered). + Split by grouping condition RNA reads per cell density (not filtered). PDF format raw_gene_dnst_spl_cnd_plot_png: @@ -458,7 +458,7 @@ outputs: outputBinding: glob: "*_raw_mito_dnst_spl_cnd.png" doc: | - Split by grouping condition the percentage of transcripts mapped + Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered). PNG format @@ -467,7 +467,7 @@ outputs: outputBinding: glob: "*_raw_mito_dnst_spl_cnd.pdf" doc: | - Split by grouping condition the percentage of transcripts mapped + Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered). PDF format @@ -540,7 +540,7 @@ outputs: outputBinding: glob: "*_fltr_umi_dnst.png" doc: | - Transcripts per cell density (filtered). + RNA reads per cell density (filtered). PNG format fltr_umi_dnst_plot_pdf: @@ -548,7 +548,7 @@ outputs: outputBinding: glob: "*_fltr_umi_dnst.pdf" doc: | - Transcripts per cell density (filtered). + RNA reads per cell density (filtered). PDF format fltr_gene_dnst_plot_png: @@ -572,7 +572,7 @@ outputs: outputBinding: glob: "*_fltr_gene_umi.png" doc: | - Genes vs transcripts per cell correlation (filtered). 
+ Genes vs RNA reads per cell correlation (filtered). PNG format fltr_gene_umi_plot_pdf: @@ -580,7 +580,7 @@ outputs: outputBinding: glob: "*_fltr_gene_umi.pdf" doc: | - Genes vs transcripts per cell correlation (filtered). + Genes vs RNA reads per cell correlation (filtered). PDF format fltr_mito_dnst_plot_png: @@ -588,7 +588,7 @@ outputs: outputBinding: glob: "*_fltr_mito_dnst.png" doc: | - Percentage of transcripts mapped to mitochondrial genes per cell density (filtered). + Percentage of RNA reads mapped to mitochondrial genes per cell density (filtered). PNG format fltr_mito_dnst_plot_pdf: @@ -596,7 +596,7 @@ outputs: outputBinding: glob: "*_fltr_mito_dnst.pdf" doc: | - Percentage of transcripts mapped to mitochondrial genes per cell density (filtered). + Percentage of RNA reads mapped to mitochondrial genes per cell density (filtered). PDF format fltr_nvlt_dnst_plot_png: @@ -652,7 +652,7 @@ outputs: outputBinding: glob: "*_fltr_umi_dnst_spl_cnd.png" doc: | - Split by grouping condition transcripts per cell density (filtered). + Split by grouping condition RNA reads per cell density (filtered). PNG format fltr_umi_dnst_spl_cnd_plot_pdf: @@ -660,7 +660,7 @@ outputs: outputBinding: glob: "*_fltr_umi_dnst_spl_cnd.pdf" doc: | - Split by grouping condition transcripts per cell density (filtered). + Split by grouping condition RNA reads per cell density (filtered). PDF format fltr_gene_dnst_spl_cnd_plot_png: @@ -684,7 +684,7 @@ outputs: outputBinding: glob: "*_fltr_mito_dnst_spl_cnd.png" doc: | - Split by grouping condition the percentage of transcripts mapped + Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density (filtered). PNG format @@ -693,7 +693,7 @@ outputs: outputBinding: glob: "*_fltr_mito_dnst_spl_cnd.pdf" doc: | - Split by grouping condition the percentage of transcripts mapped + Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density (filtered). 
PDF format @@ -904,8 +904,8 @@ s:about: | '--mex' input based on the '--identity' file. Default: 5000 (applied to all datasets) --minumis [MINUMIS [MINUMIS ...]] - Include cells where at least this many UMI - (transcripts) are detected. If multiple values + Include cells where at least this many RNA reads + are detected. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. Default: 500 (applied to all @@ -920,7 +920,7 @@ s:about: | --mitopattern MITOPATTERN Regex pattern to identify mitochondrial genes. Default: '^mt-|^MT-' - --maxmt MAXMT Include cells with the percentage of transcripts + --maxmt MAXMT Include cells with the percentage of RNA reads mapped to mitochondrial genes not bigger than this value. Default: 5 (applied to all datasets) --removedoublets Remove cells that were identified as doublets. Cells diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index 3544b72f..6c5620f7 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.32 + dockerPull: biowardrobe2/sc-tools:v0.0.33 inputs: @@ -143,7 +143,7 @@ inputs: inputBinding: prefix: "--regressmt" doc: | - Regress the percentage of transcripts mapped to mitochondrial genes as a + Regress the percentage of RNA reads mapped to mitochondrial genes as a confounding source of variation. Default: false @@ -499,7 +499,7 @@ outputs: glob: "*_umap_spl_umi.png" doc: | UMAP, colored by dataset, split by - transcripts per cell. + RNA reads per cell. PNG format umap_spl_umi_plot_pdf: @@ -508,7 +508,7 @@ outputs: glob: "*_umap_spl_umi.pdf" doc: | UMAP, colored by dataset, split by - transcripts per cell. + RNA reads per cell. PDF format umap_spl_gene_plot_png: @@ -641,7 +641,7 @@ outputs: glob: "*_umap_gr_cnd_spl_umi.png" doc: | UMAP, colored by grouping condition, - split by transcripts per cell. 
+ split by RNA reads per cell. PNG format umap_gr_cnd_spl_umi_plot_pdf: @@ -650,7 +650,7 @@ outputs: glob: "*_umap_gr_cnd_spl_umi.pdf" doc: | UMAP, colored by grouping condition, - split by transcripts per cell. + split by RNA reads per cell. PDF format umap_gr_cnd_spl_gene_plot_png: @@ -861,7 +861,7 @@ s:about: | Number of highly variable genes used in datasets integration, scaling and dimensionality reduction. Default: 3000 - --regressmt Regress the percentage of transcripts mapped to + --regressmt Regress the percentage of RNA reads mapped to mitochondrial genes as a confounding source of variation. Default: false --regressgenes REGRESSGENES diff --git a/tools/sc-rna-trajectory.cwl b/tools/sc-rna-trajectory.cwl index ab5a31e9..b3619ab7 100644 --- a/tools/sc-rna-trajectory.cwl +++ b/tools/sc-rna-trajectory.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.32 + dockerPull: biowardrobe2/sc-tools:v0.0.33 inputs: diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index debe1e31..eda197d0 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.32 + dockerPull: biowardrobe2/sc-tools:v0.0.33 inputs: diff --git a/tools/sc-vdj-profile.cwl b/tools/sc-vdj-profile.cwl index 116e1d2c..f4e73ca1 100644 --- a/tools/sc-vdj-profile.cwl +++ b/tools/sc-vdj-profile.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.32 + dockerPull: biowardrobe2/sc-tools:v0.0.33 inputs: diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index a575ead5..5d1723fa 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.32 + dockerPull: biowardrobe2/sc-tools:v0.0.33 inputs: @@ -747,7 +747,7 @@ outputs: outputBinding: glob: 
"*_cvrg_res_*.png" doc: | - Fragments coverage. + ATAC fragments coverage. PNG format cvrg_res_plot_pdf: @@ -758,7 +758,7 @@ outputs: outputBinding: glob: "*_cvrg_res_*.pdf" doc: | - Fragments coverage. + ATAC fragments coverage. PDF format xpr_htmp_res_plot_png: diff --git a/workflows/cellranger-atac-count.cwl b/workflows/cellranger-atac-count.cwl index a9569e2e..f7e14a4f 100644 --- a/workflows/cellranger-atac-count.cwl +++ b/workflows/cellranger-atac-count.cwl @@ -56,9 +56,9 @@ inputs: force_cells: type: int? default: null - label: "Define the top N barcodes with the most fragments overlapping peaks as cells" + label: "Define the top N barcodes with the most ATAC fragments overlapping peaks as cells" doc: | - Define the top N barcodes with the most fragments overlapping + Define the top N barcodes with the most ATAC fragments overlapping peaks as cells. N must be a positive integer <= 20,000. Please consult the documentation before using this option 'sd:layout': From bef497c94f255e0cdc20de6cfa2b9f3e38d50e5b Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 22 Jan 2024 18:49:18 -0500 Subject: [PATCH 106/162] Refactor sc-rna-filter pipeline. 
Update some other labels and docs --- workflows/cellranger-aggr.cwl | 14 +- workflows/cellranger-arc-aggr.cwl | 22 +- workflows/cellranger-arc-count.cwl | 3 +- workflows/cellranger-atac-aggr.cwl | 10 +- workflows/cellranger-atac-count.cwl | 2 +- workflows/cellranger-reanalyze.cwl | 3 +- workflows/diffbind-multi-factor.cwl | 10 +- workflows/sc-atac-cluster.cwl | 30 +- workflows/sc-atac-dbinding.cwl | 20 +- workflows/sc-atac-reduce.cwl | 4 +- workflows/sc-ctype-assign.cwl | 29 +- workflows/sc-multiome-filter.cwl | 105 +++-- workflows/sc-rna-cluster.cwl | 4 +- workflows/sc-rna-da-cells.cwl | 8 +- workflows/sc-rna-de-pseudobulk.cwl | 10 +- workflows/sc-rna-filter.cwl | 660 +++++++++++++++------------- workflows/sc-rna-reduce.cwl | 4 +- workflows/sc-rna-trajectory.cwl | 4 +- workflows/sc-triangulate.cwl | 8 +- workflows/sc-vdj-profile.cwl | 20 +- workflows/sc-wnn-cluster.cwl | 21 +- 21 files changed, 544 insertions(+), 447 deletions(-) diff --git a/workflows/cellranger-aggr.cwl b/workflows/cellranger-aggr.cwl index 39a74710..689872f0 100644 --- a/workflows/cellranger-aggr.cwl +++ b/workflows/cellranger-aggr.cwl @@ -27,8 +27,12 @@ inputs: type: - "null" - File[] - label: "Single-cell Experiment" - doc: "Molecule-level information from individual runs of cellranger count" + label: "Cell Ranger RNA or RNA+VDJ Sample" + doc: | + Any "Cell Ranger RNA or RNA+VDJ Sample" + that produces gene expression and, + optionally, V(D)J contigs data, from a + single 10x Genomics library 'sd:upstreamSource': "sc_experiment/molecule_info_h5" 'sd:localLabel': true @@ -36,17 +40,11 @@ inputs: type: - "null" - Directory[] - label: "Single-cell Experiment" - doc: "Filtered data folders from individual runs of cellranger multi" 'sd:upstreamSource': "sc_experiment/filtered_data_folder" - 'sd:localLabel': true gem_well_labels: type: string[] - label: "Single-cell Experiment" - doc: "Array of GEM well identifiers to be used for labeling purposes only" 'sd:upstreamSource': "sc_experiment/alias" - 
'sd:localLabel': true normalization_mode: type: diff --git a/workflows/cellranger-arc-aggr.cwl b/workflows/cellranger-arc-aggr.cwl index ac643f33..c51e57a7 100644 --- a/workflows/cellranger-arc-aggr.cwl +++ b/workflows/cellranger-arc-aggr.cwl @@ -10,7 +10,7 @@ requirements: 'sd:upstream': - sc_rnaseq_sample: + sc_arc_sample: - "cellranger-arc-count.cwl" genome_indices: - "cellranger-mkref.cwl" @@ -26,28 +26,28 @@ inputs: gex_molecule_info_h5: type: File[] - label: "Cell Ranger ARC Sample" + label: "Cell Ranger RNA+ATAC Sample" doc: | - Any "Cell Ranger ARC Sample" that - produces RNA molecule-level data, - ATAC fragments, and ATAC and RNA - barcode metrics files. - 'sd:upstreamSource': "sc_rnaseq_sample/gex_molecule_info_h5" + Any "Cell Ranger RNA+ATAC Sample" + that produces both gene expression + and chromatin accessibility data + from a single 10x Genomics library + 'sd:upstreamSource': "sc_arc_sample/gex_molecule_info_h5" 'sd:localLabel': true gem_well_labels: type: string[] - 'sd:upstreamSource': "sc_rnaseq_sample/alias" + 'sd:upstreamSource': "sc_arc_sample/alias" atac_fragments_file_from_count: type: File[] secondaryFiles: - .tbi - 'sd:upstreamSource': "sc_rnaseq_sample/atac_fragments_file" + 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" barcode_metrics_report: type: File[] - 'sd:upstreamSource': "sc_rnaseq_sample/barcode_metrics_report" + 'sd:upstreamSource': "sc_arc_sample/barcode_metrics_report" indices_folder: type: Directory @@ -400,4 +400,4 @@ doc: | Combines outputs from multiple runs of “Cell Ranger Count (RNA+ATAC)” pipeline. The results of this workflow are primarily used in - “Single-Cell Multiome ATAC and RNA-Seq Filtering Analysis” pipeline. + “Single-Cell Multiome ATAC and RNA-Seq Filtering Analysis” pipeline. 
\ No newline at end of file diff --git a/workflows/cellranger-arc-count.cwl b/workflows/cellranger-arc-count.cwl index 022f0b16..f25d05ec 100644 --- a/workflows/cellranger-arc-count.cwl +++ b/workflows/cellranger-arc-count.cwl @@ -32,7 +32,8 @@ inputs: 'sd:localLabel': true memory_limit: - type: int + type: int? + default: 20 'sd:upstreamSource': "genome_indices/memory_limit" gex_fastq_file_r1: diff --git a/workflows/cellranger-atac-aggr.cwl b/workflows/cellranger-atac-aggr.cwl index 7036680a..fbed12f1 100644 --- a/workflows/cellranger-atac-aggr.cwl +++ b/workflows/cellranger-atac-aggr.cwl @@ -316,8 +316,8 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cellranger Aggregate (ATAC)" -s:name: "Cellranger Aggregate (ATAC)" +label: "Cell Ranger Aggregate (ATAC)" +s:name: "Cell Ranger Aggregate (ATAC)" s:alternateName: "Combines outputs from multiple runs of Cell Ranger Count (ATAC) pipeline" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/cellranger-atac-aggr.cwl @@ -356,7 +356,7 @@ s:creator: doc: | - Cellranger Aggregate (ATAC) + Cell Ranger Aggregate (ATAC) - Combines outputs from multiple runs of “Cell Ranger - Count (ATAC)” pipeline. + Combines outputs from multiple runs of + “Cell Ranger Count (ATAC)” pipeline. \ No newline at end of file diff --git a/workflows/cellranger-atac-count.cwl b/workflows/cellranger-atac-count.cwl index f7e14a4f..eb81ae82 100644 --- a/workflows/cellranger-atac-count.cwl +++ b/workflows/cellranger-atac-count.cwl @@ -520,5 +520,5 @@ doc: | Quantifies single-cell chromatin accessibility of the sequencing data from a single 10x Genomics library. The results of this - workflow are primarily used in “Cellranger Aggregate (ATAC)” + workflow are primarily used in “Cell Ranger Aggregate (ATAC)” pipeline. 
\ No newline at end of file diff --git a/workflows/cellranger-reanalyze.cwl b/workflows/cellranger-reanalyze.cwl index 1d815d3e..cb34623b 100644 --- a/workflows/cellranger-reanalyze.cwl +++ b/workflows/cellranger-reanalyze.cwl @@ -613,4 +613,5 @@ s:creator: doc: | - Cellranger Reanalyze \ No newline at end of file + Cellranger Reanalyze + ==================== \ No newline at end of file diff --git a/workflows/diffbind-multi-factor.cwl b/workflows/diffbind-multi-factor.cwl index e5d5f801..19a1d65b 100644 --- a/workflows/diffbind-multi-factor.cwl +++ b/workflows/diffbind-multi-factor.cwl @@ -22,8 +22,13 @@ requirements: - "trim-chipseq-pe.cwl" - "trim-atacseq-se.cwl" - "trim-atacseq-pe.cwl" + - "https://github.com/datirium/workflows/workflows/trim-chipseq-se.cwl" + - "https://github.com/datirium/workflows/workflows/trim-chipseq-pe.cwl" + - "https://github.com/datirium/workflows/workflows/trim-atacseq-se.cwl" + - "https://github.com/datirium/workflows/workflows/trim-atacseq-pe.cwl" genome_indices: - "genome-indices.cwl" + - "https://github.com/datirium/workflows/workflows/genome-indices.cwl" inputs: @@ -792,7 +797,7 @@ steps: - stdout_log - stderr_log - pdf_plots: + folder_pdf_plots: run: ../tools/files-to-folder.cwl in: input_files: @@ -816,7 +821,7 @@ steps: compress_pdf_plots: run: ../tools/tar-compress.cwl in: - folder_to_compress: pdf_plots/folder + folder_to_compress: folder_pdf_plots/folder out: - compressed_folder @@ -1064,6 +1069,7 @@ s:creator: doc: | DiffBind Multi-factor Analysis + ------------------------------ DiffBind processes ChIP-Seq data enriched for genomic loci where specific protein/DNA binding occurs, including peak sets identified by ChIP-Seq peak callers and aligned sequence read datasets. 
It is designed to work with multiple peak sets simultaneously, representing different ChIP experiments (antibodies, transcription diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index 8d4a0025..74836d81 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -27,10 +27,11 @@ requirements: - "sc-rna-cluster.cwl" - "sc-rna-reduce.cwl" - "sc-atac-reduce.cwl" - sc_arc_sample: + sc_atac_sample: - "cellranger-arc-count.cwl" - "cellranger-arc-aggr.cwl" - + - "cellranger-atac-count.cwl" + - "cellranger-atac-aggr.cwl" inputs: @@ -57,12 +58,17 @@ inputs: type: File? secondaryFiles: - .tbi - label: "Cell Ranger ARC Sample (optional)" + label: "Cell Ranger ATAC or RNA+ATAC Sample (optional)" doc: | - "Cell Ranger ARC Sample" for generating - ATAC fragments coverage plots over the genes - of interest. - 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" + Any "Cell Ranger ATAC or RNA+ATAC Sample" + for generating ATAC fragments coverage + plots over the genes of interest. This + sample can be analyzed with one of the + following pipelines: "Cell Ranger Count + (RNA+ATAC)", "Cell Ranger Aggregate + (RNA+ATAC)", "Cell Ranger Count (ATAC)", + or "Cell Ranger Aggregate (ATAC)". + 'sd:upstreamSource': "sc_atac_sample/atac_fragments_file" 'sd:localLabel': true dimensions: @@ -115,9 +121,9 @@ inputs: label: "Genes of interest" doc: | Comma or space separated list of genes - of interest to generate ATAC fragments coverage - plots. Ignored if "Cell Ranger ARC Sample" - input is not provided. + of interest to generate ATAC fragments + coverage plots. Ignored if "Cell Ranger + ATAC or RNA+ATAC" input is not provided. 
Default: None color_theme: @@ -428,7 +434,7 @@ steps: - stdout_log - stderr_log - pdf_plots: + folder_pdf_plots: run: ../tools/files-to-folder.cwl in: input_files: @@ -451,7 +457,7 @@ steps: compress_pdf_plots: run: ../tools/tar-compress.cwl in: - folder_to_compress: pdf_plots/folder + folder_to_compress: folder_pdf_plots/folder out: - compressed_folder diff --git a/workflows/sc-atac-dbinding.cwl b/workflows/sc-atac-dbinding.cwl index a4233a7e..7f880944 100644 --- a/workflows/sc-atac-dbinding.cwl +++ b/workflows/sc-atac-dbinding.cwl @@ -36,6 +36,7 @@ requirements: - "cellranger-atac-aggr.cwl" genome_indices: - "genome-indices.cwl" + - "https://github.com/datirium/workflows/workflows/genome-indices.cwl" inputs: @@ -62,12 +63,15 @@ inputs: type: File secondaryFiles: - .tbi - label: "Cell Ranger ATAC/ARC Count/Aggregate Experiment" - doc: | - Count and barcode information for every ATAC fragment - used in the loaded Seurat object. File should be saved - in TSV format with tbi-index file. - tbi-indexed. + label: "Cell Ranger ATAC or RNA+ATAC Sample" + doc: | + Any "Cell Ranger ATAC or RNA+ATAC Sample" + for loading chromatin accessibility data + from. This sample can be analyzed with + one of the following pipelines: "Cell + Ranger Count (RNA+ATAC)", "Cell Ranger + Aggregate (RNA+ATAC)", "Cell Ranger Count + (ATAC)", or "Cell Ranger Aggregate (ATAC)". 
'sd:upstreamSource': "sc_atac_sample/atac_fragments_file" 'sd:localLabel': true @@ -697,7 +701,7 @@ steps: - stdout_log - stderr_log - pdf_plots: + folder_pdf_plots: run: ../tools/files-to-folder.cwl in: input_files: @@ -715,7 +719,7 @@ steps: compress_pdf_plots: run: ../tools/tar-compress.cwl in: - folder_to_compress: pdf_plots/folder + folder_to_compress: folder_pdf_plots/folder out: - compressed_folder diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index f24ffbbc..fe8ca1d8 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -560,7 +560,7 @@ steps: - stdout_log - stderr_log - pdf_plots: + folder_pdf_plots: run: ../tools/files-to-folder.cwl in: input_files: @@ -591,7 +591,7 @@ steps: compress_pdf_plots: run: ../tools/tar-compress.cwl in: - folder_to_compress: pdf_plots/folder + folder_to_compress: folder_pdf_plots/folder out: - compressed_folder diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index c9be7fc8..7eb3da55 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -31,9 +31,11 @@ requirements: - "sc-rna-cluster.cwl" - "sc-atac-cluster.cwl" - "sc-wnn-cluster.cwl" - sc_arc_sample: + sc_atac_sample: - "cellranger-arc-count.cwl" - "cellranger-arc-aggr.cwl" + - "cellranger-atac-count.cwl" + - "cellranger-atac-aggr.cwl" inputs: @@ -62,12 +64,17 @@ inputs: type: File? secondaryFiles: - .tbi - label: "Cell Ranger ARC Sample (optional)" - doc: | - "Cell Ranger ARC Sample" for generating - ATAC fragments coverage plots over the genes - of interest. - 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" + label: "Cell Ranger ATAC or RNA+ATAC Sample (optional)" + doc: | + Any "Cell Ranger ATAC or RNA+ATAC Sample" + for generating ATAC fragments coverage + plots over the genes of interest. 
This + sample can be analyzed with one of the + following pipelines: "Cell Ranger Count + (RNA+ATAC)", "Cell Ranger Aggregate + (RNA+ATAC)", "Cell Ranger Count (ATAC)", + or "Cell Ranger Aggregate (ATAC)". + 'sd:upstreamSource': "sc_atac_sample/atac_fragments_file" 'sd:localLabel': true query_reduction: @@ -155,8 +162,8 @@ inputs: Comma or space separated list of genes of interest to visualize expression and to generate ATAC fragments coverage plots. - Ignored if "Cell Ranger ARC Sample" input - is not provided. + Ignored if "Cell Ranger ATAC or RNA+ATAC + Sample" input is not provided. Default: None cell_type_data: @@ -736,7 +743,7 @@ steps: - stdout_log - stderr_log - pdf_plots: + folder_pdf_plots: run: ../tools/files-to-folder.cwl in: input_files: @@ -778,7 +785,7 @@ steps: compress_pdf_plots: run: ../tools/tar-compress.cwl in: - folder_to_compress: pdf_plots/folder + folder_to_compress: folder_pdf_plots/folder out: - compressed_folder diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 31cbf2f2..e3669a6c 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -30,13 +30,20 @@ inputs: filtered_feature_bc_matrix_folder: type: File - label: "Cell Ranger ARC Sample" - doc: | - Any "Cell Ranger ARC Sample" that produces - compressed folder with feature-barcode - matrix in MEX format, ATAC fragments file - in TSV format, and optional aggregation + label: "Cell Ranger RNA+ATAC Sample" + doc: | + Any "Cell Ranger RNA+ATAC Sample" + that produces both gene expression + and chromatin accessibility data + in a form of a single compressed + feature-barcode matrix in a MEX + format, ATAC fragments file in TSV + format, and optional aggregation metadata file in TSV/CSV format. + This sample can be analyzed with + either "Cell Ranger Count (RNA+ATAC)" + or "Cell Ranger Aggregate (RNA+ATAC)" + pipeline. 
"sd:upstreamSource": "sc_arc_sample/filtered_feature_bc_matrix_folder" "sd:localLabel": true @@ -72,14 +79,14 @@ inputs: type: File? label: "Datasets grouping (optional)" doc: | - If the selected "Cell Ranger ARC + If the selected "Cell Ranger RNA+ATAC Sample" includes multiple aggregated datasets, each dataset can be assigned to a separate group by providing a TSV/CSV file with "library_id" and "condition" columns. Obtain this file from the "aggregation_metadata.csv" - output generated by "Cell Ranger ARC + output generated by "Cell Ranger RNA+ATAC Sample" and accessible on the "Files" tab. Remove all columns except the "library_id". Add the group names for @@ -98,9 +105,9 @@ inputs: one cell barcode per line. All other columns, except for "barcode", will be added to the single cell metadata loaded - from "Cell Ranger ARC Sample" and can be - utilized in the current or future steps - of analysis. + from "Cell Ranger RNA+ATAC Sample" and + can be utilized in the current or future + steps of analysis. call_by: type: string? @@ -115,7 +122,7 @@ inputs: any single cell metadata added through the "Selected cell barcodes (optional)" input. Default: use the original peaks - generated by Cell Ranger ARC. + generated by Cell Ranger RNA+ATAC Sample. 'sd:layout': advanced: true @@ -163,7 +170,7 @@ inputs: to exclude from the analysis all cells with the number of RNA reads smaller than the provided value. - If the selected "Cell Ranger ARC + If the selected "Cell Ranger RNA+ATAC Sample" includes multiple aggregated datasets, each of them can be filtered independently by providing comma or @@ -172,8 +179,9 @@ inputs: the specified values need to match with the datasets order from the "aggregation_metadata.csv" output - generated by "Cell Ranger ARC Sample" - and accessible on the "Files" tab. + generated by "Cell Ranger RNA+ATAC + Sample" and accessible on the "Files" + tab. 
Default: 500 "sd:layout": advanced: true @@ -187,7 +195,7 @@ inputs: to exclude from the analysis all cells with the number of expressed genes smaller than the provided value. - If the selected "Cell Ranger ARC + If the selected "Cell Ranger RNA+ATAC Sample" includes multiple aggregated datasets, each of them can be filtered independently by providing comma or @@ -196,8 +204,9 @@ inputs: the specified values need to match with the datasets order from the "aggregation_metadata.csv" output - generated by "Cell Ranger ARC Sample" - and accessible on the "Files" tab. + generated by "Cell Ranger RNA+ATAC + Sample" and accessible on the "Files" + tab. Default: 250 "sd:layout": advanced: true @@ -211,7 +220,7 @@ inputs: to exclude from the analysis all cells with the number of expressed genes bigger than the provided value. - If the selected "Cell Ranger ARC + If the selected "Cell Ranger RNA+ATAC Sample" includes multiple aggregated datasets, each of them can be filtered independently by providing comma or @@ -220,8 +229,9 @@ inputs: the specified values need to match with the datasets order from the "aggregation_metadata.csv" output - generated by "Cell Ranger ARC Sample" - and accessible on the "Files" tab. + generated by "Cell Ranger RNA+ATAC + Sample" and accessible on the "Files" + tab. Default: 5000 "sd:layout": advanced: true @@ -264,7 +274,7 @@ inputs: transcriptomic dissimilarity of the cells and is calculated as the ratio of log10(Genes) to log10(RNA UMI). - If the selected "Cell Ranger ARC + If the selected "Cell Ranger RNA+ATAC Sample" includes multiple aggregated datasets, each of them can be filtered independently by providing comma or @@ -273,9 +283,9 @@ inputs: the specified values need to match with the datasets order from the "aggregation_metadata.csv" output - generated by "Cell Ranger ARC Sample" - and accessible on the "Files" tab. - Default: 0.8 + generated by "Cell Ranger RNA+ATAC + Sample" and accessible on the "Files" + tab. 
Default: 0.8 "sd:layout": advanced: true @@ -289,16 +299,17 @@ inputs: cells with the number of ATAC fragments in peaks smaller than the provided value. If the selected "Cell Ranger - ARC Sample" includes multiple aggregated - datasets, each of them can be filtered - independently by providing comma or - space-separated list of filtering + RNA+ATAC Sample" includes multiple + aggregated datasets, each of them can + be filtered independently by providing + comma or space-separated list of filtering thresholds. The order and number of the specified values need to match with the datasets order from the "aggregation_metadata.csv" output - generated by "Cell Ranger ARC Sample" - and accessible on the "Files" tab. + generated by "Cell Ranger RNA+ATAC + Sample" and accessible on the "Files" + tab. Default: 1000 "sd:layout": advanced: true @@ -316,7 +327,7 @@ inputs: on the ratio of ATAC fragments centered at the genes TSS to ATAC fragments in the TSS-flanking regions. - If the selected "Cell Ranger ARC + If the selected "Cell Ranger RNA+ATAC Sample" includes multiple aggregated datasets, each of them can be filtered independently by providing comma or @@ -325,8 +336,9 @@ inputs: the specified values need to match with the datasets order from the "aggregation_metadata.csv" output - generated by "Cell Ranger ARC Sample" - and accessible on the "Files" tab. + generated by "Cell Ranger RNA+ATAC + Sample" and accessible on the "Files" + tab. Default: 2 "sd:layout": advanced: true @@ -341,7 +353,7 @@ inputs: cells with the FRiP (Fraction of Reads in Peaks) smaller than the provided value. - If the selected "Cell Ranger ARC + If the selected "Cell Ranger RNA+ATAC Sample" includes multiple aggregated datasets, each of them can be filtered independently by providing comma or @@ -350,8 +362,9 @@ inputs: the specified values need to match with the datasets order from the "aggregation_metadata.csv" output - generated by "Cell Ranger ARC Sample" - and accessible on the "Files" tab. 
+ generated by "Cell Ranger RNA+ATAC + Sample" and accessible on the "Files" + tab. Default: 0.15 "sd:layout": advanced: true @@ -369,7 +382,7 @@ inputs: of nucleosome occupancy. It quantifies the approximate ratio of mononucleosomal to nucleosome-free ATAC fragments. - If the selected "Cell Ranger ARC + If the selected "Cell Ranger RNA+ATAC Sample" includes multiple aggregated datasets, each of them can be filtered independently by providing comma or @@ -378,8 +391,9 @@ inputs: the specified values need to match with the datasets order from the "aggregation_metadata.csv" output - generated by "Cell Ranger ARC Sample" - and accessible on the "Files" tab. + generated by "Cell Ranger RNA+ATAC + Sample" and accessible on the "Files" + tab. Default: 4 "sd:layout": advanced: true @@ -394,7 +408,7 @@ inputs: cells with the fraction of ATAC fragments in genomic blacklist regions bigger than the provided value. - If the selected "Cell Ranger ARC + If the selected "Cell Ranger RNA+ATAC Sample" includes multiple aggregated datasets, each of them can be filtered independently by providing comma or @@ -403,8 +417,9 @@ inputs: the specified values need to match with the datasets order from the "aggregation_metadata.csv" output - generated by "Cell Ranger ARC Sample" - and accessible on the "Files" tab. + generated by "Cell Ranger RNA+ATAC + Sample" and accessible on the "Files" + tab. 
Default: 0.05 "sd:layout": advanced: true @@ -1368,7 +1383,7 @@ steps: - stdout_log - stderr_log - pdf_plots: + folder_pdf_plots: run: ../tools/files-to-folder.cwl in: input_files: @@ -1434,7 +1449,7 @@ steps: compress_pdf_plots: run: ../tools/tar-compress.cwl in: - folder_to_compress: pdf_plots/folder + folder_to_compress: folder_pdf_plots/folder out: - compressed_folder diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index b13806e0..18560bec 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -535,7 +535,7 @@ steps: - stdout_log - stderr_log - pdf_plots: + folder_pdf_plots: run: ../tools/files-to-folder.cwl in: input_files: @@ -565,7 +565,7 @@ steps: compress_pdf_plots: run: ../tools/tar-compress.cwl in: - folder_to_compress: pdf_plots/folder + folder_to_compress: folder_pdf_plots/folder out: - compressed_folder diff --git a/workflows/sc-rna-da-cells.cwl b/workflows/sc-rna-da-cells.cwl index 9bb5a26a..598c8733 100644 --- a/workflows/sc-rna-da-cells.cwl +++ b/workflows/sc-rna-da-cells.cwl @@ -138,13 +138,13 @@ inputs: - "5" - "6" default: "1" - label: "Cores/CPUs" + label: "Number of cores/cpus to use" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. 
Default: 1 - "sd:layout": + 'sd:layout': advanced: true @@ -401,7 +401,7 @@ steps: - stdout_log - stderr_log - pdf_plots: + folder_pdf_plots: run: ../tools/files-to-folder.cwl in: input_files: @@ -425,7 +425,7 @@ steps: compress_pdf_plots: run: ../tools/tar-compress.cwl in: - folder_to_compress: pdf_plots/folder + folder_to_compress: folder_pdf_plots/folder out: - compressed_folder diff --git a/workflows/sc-rna-de-pseudobulk.cwl b/workflows/sc-rna-de-pseudobulk.cwl index 4a7e503e..344920de 100644 --- a/workflows/sc-rna-de-pseudobulk.cwl +++ b/workflows/sc-rna-de-pseudobulk.cwl @@ -47,9 +47,9 @@ inputs: Analysis that includes single-cell multiome RNA and ATAC-Seq or just RNA-Seq datasets run through either - "Single-cell Manual Cell Type - Assignment", "Single-cell RNA-Seq - Cluster Analysis", or "Single-cell + "Single-Cell Manual Cell Type + Assignment", "Single-Cell RNA-Seq + Cluster Analysis", or "Single-Cell WNN Cluster Analysis" at any of the processing stages. 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" @@ -684,7 +684,7 @@ steps: - stdout_log - stderr_log - pdf_plots: + folder_pdf_plots: run: ../tools/files-to-folder.cwl in: input_files: @@ -710,7 +710,7 @@ steps: compress_pdf_plots: run: ../tools/tar-compress.cwl in: - folder_to_compress: pdf_plots/folder + folder_to_compress: folder_pdf_plots/folder out: - compressed_folder diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index 933e3633..18d6f0c0 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -14,8 +14,8 @@ requirements: }; -'sd:upstream': - sc_rnaseq_sample: +"sd:upstream": + sc_rna_sample: - "cellranger-aggr.cwl" - "single-cell-preprocess-cellranger.cwl" - "cellranger-multi.cwl" @@ -26,153 +26,204 @@ inputs: alias: type: string - label: "Experiment short name/alias" + label: "Analysis name" sd:preview: position: 1 filtered_feature_bc_matrix_folder: type: File - label: "Cell Ranger Count/Aggregate Experiment" - doc: | - Path to the compressed 
folder with feature-barcode matrix from Cell Ranger Count/Aggregate - experiment in MEX format. - 'sd:upstreamSource': "sc_rnaseq_sample/filtered_feature_bc_matrix_folder" - 'sd:localLabel': true + label: "Cell Ranger RNA or RNA+VDJ Sample" + doc: | + Any "Cell Ranger RNA or RNA+VDJ Sample" + that produces gene expression data in + a form of compressed feature-barcode + matrix in a MEX format, optional annotated + V(D)J contigs data, and optional aggregation + metadata file in TSV/CSV format. This + sample can be analyzed with one of the + following pipelines: "Cell Ranger Count + (RNA)", "Cell Ranger Count (RNA+VDJ)", + or "Cell Ranger Aggregate (RNA, RNA+VDJ)" + "sd:upstreamSource": "sc_rna_sample/filtered_feature_bc_matrix_folder" + "sd:localLabel": true aggregation_metadata: type: File? - label: "Cell Ranger Count/Aggregate Experiment" - doc: | - Path to the metadata TSV/CSV file to set the datasets identities. If '--mex' points to - the Cell Ranger Aggregate outputs, the aggregation.csv file can be used. If input is not - provided, the default dummy_metadata.csv will be used instead. - 'sd:upstreamSource': "sc_rnaseq_sample/aggregation_metadata" - 'sd:localLabel': true + "sd:upstreamSource": "sc_rna_sample/aggregation_metadata" grouping_data: type: File? - label: "Optional TSV/CSV file to define datasets grouping with 'library_id' and 'condition' columns. Rows order should correspond to the aggregation metadata." - doc: | - Path to the TSV/CSV file to define datasets grouping. - First column - 'library_id' with the values and order - that correspond to the 'library_id' column from the ' - --identity' file, second column 'condition'. - Default: each dataset is assigned to its own group. + label: "Datasets grouping (optional)" + doc: | + If the selected "Cell Ranger RNA or + RNA+VDJ Sample" includes multiple + aggregated datasets, each dataset + can be assigned to a separate group + by providing a TSV/CSV file with + "library_id" and "condition" + columns. 
Obtain this file from + the "aggregation_metadata.csv" + output generated by "Cell Ranger + Aggregate (RNA, RNA+VDJ)" and + accessible on the "Files" tab. Remove + all columns except the "library_id". + Add the group names for each dataset + in a separate column named "condition". barcodes_data: type: File? - label: "Optional TSV/CSV file to prefilter and extend metadata be barcodes. First column should be named as 'barcode'" - doc: | - Path to the TSV/CSV file to optionally prefilter and - extend Seurat object metadata be selected barcodes. - First column should be named as 'barcode'. If file - includes any other columns they will be added to the - Seurat object metadata ovewriting the existing ones if - those are present. - Default: all cells used, no extra metadata is added + label: "Selected cell barcodes (optional)" + doc: | + A TSV/CSV file to optionally prefilter + the single cell data by including only + the cells with the selected barcodes. + The provided file should include at + least one column named "barcode", with + one cell barcode per line. All other + columns, except for "barcode", will be + added to the single cell metadata loaded + from "Cell Ranger RNA or RNA+VDJ Sample" + and can be utilized in the current or + future steps of analysis. - minimum_genes: - type: string? - default: "250" - label: "Include cells where at least this many genes are detected" + remove_doublets: + type: boolean? + default: false + label: "Remove doublets" doc: | - Include cells where at least this many genes are detected. If multiple values - provided, each of them will be applied to the correspondent dataset from the - '--mex' input based on the '--identity' file. - Default: 250 (applied to all datasets) - 'sd:layout': + Quality control filtering parameter + to remove cells identified as doublets. + Default: do not remove + "sd:layout": advanced: true - maximum_genes: + minimum_umis: type: string? 
- default: "5000" - label: "Include cells with the number of genes not bigger than this value" - doc: | - Include cells with the number of genes not bigger than this value. If multiple - values provided, each of them will be applied to the correspondent dataset from - the '--mex' input based on the '--identity' file. - Default: 5000 (applied to all datasets) - 'sd:layout': + default: "500" + label: "Minimum number of RNA reads per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the number of RNA reads + smaller than the provided value. + If the selected "Cell Ranger RNA or + RNA+VDJ Sample" includes multiple + aggregated datasets, each of them + can be filtered independently by + providing comma or space-separated + list of filtering thresholds. The + order and number of the specified + values need to match with the datasets + order from the "aggregation_metadata.csv" + output generated by "Cell Ranger RNA or + RNA+VDJ Sample" and accessible on the + "Files" tab. + Default: 500 + "sd:layout": advanced: true - minimum_umis: + minimum_genes: type: string? - default: "500" - label: "Include cells where at least this many RNA reads are detected" - doc: | - Include cells where at least this many RNA reads are detected. - If multiple values provided, each of them will be applied to the correspondent - dataset from the '--mex' input based on the '--identity' file. - Default: 500 (applied to all datasets) - 'sd:layout': + default: "250" + label: "Minimum number of genes per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the number of expressed + genes smaller than the provided value. + If the selected "Cell Ranger RNA or + RNA+VDJ Sample" includes multiple + aggregated datasets, each of them + can be filtered independently by + providing comma or space-separated + list of filtering thresholds. 
The + order and number of the specified + values need to match with the datasets + order from the "aggregation_metadata.csv" + output generated by "Cell Ranger RNA or + RNA+VDJ Sample" and accessible on the + "Files" tab. + Default: 250 + "sd:layout": advanced: true - minimum_novelty_score: + maximum_genes: type: string? - default: "0.8" - label: "Include cells with the novelty score not lower than this value, calculated as log10(genes)/log10(UMI)" - doc: | - Include cells with the novelty score not lower than this value, calculated - as log10(genes)/log10(UMI). If multiple values provided, each of them will - be applied to the correspondent dataset from the '--mex' input based on the - '--identity' file. - Default: 0.8 (applied to all datasets) - 'sd:layout': + default: "5000" + label: "Maximum number of genes per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the number of expressed + genes bigger than the provided value. + If the selected "Cell Ranger RNA or + RNA+VDJ Sample" includes multiple + aggregated datasets, each of them + can be filtered independently by + providing comma or space-separated + list of filtering thresholds. The + order and number of the specified + values need to match with the datasets + order from the "aggregation_metadata.csv" + output generated by "Cell Ranger RNA or + RNA+VDJ Sample" and accessible on the + "Files" tab. + Default: 5000 + "sd:layout": advanced: true mito_pattern: type: string? default: "^mt-|^MT-" - label: "Regex pattern to identify mitochondrial genes" + label: "Mitochondrial genes pattern" doc: | - Regex pattern to identify mitochondrial genes. - Default: '^mt-|^MT-' - 'sd:layout': + Regex pattern to identify mitochondrial + genes based on their names. + Default: "^mt-|^MT-" + "sd:layout": advanced: true maximum_mito_perc: type: float? 
default: 5 - label: "Include cells with the percentage of RNA reads mapped to mitochondrial genes not bigger than this value" - doc: | - Include cells with the percentage of RNA reads mapped to mitochondrial - genes not bigger than this value. - Default: 5 (applied to all datasets) - 'sd:layout': - advanced: true - - remove_doublets: - type: boolean? - default: false - label: "Remove cells that were identified as doublets" - doc: | - Remove cells that were identified as doublets. Cells with - RNA UMI < 200 will not be evaluated. Default: do not remove - doublets - 'sd:layout': - advanced: true - - rna_doublet_rate: - type: float? - default: null - label: "Expected RNA doublet rate" - doc: | - Expected RNA doublet rate. Default: 1 percent per - thousand cells captured with 10x genomics - 'sd:layout': + label: "Maximum mitochondrial percentage per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the percentage of RNA reads + mapped to mitochondrial genes exceeding + the provided value. + Default: 5 + "sd:layout": advanced: true - rna_doublet_rate_sd: - type: float? - default: null - label: "Uncertainty range in the RNA doublet rate" - doc: | - Uncertainty range in the RNA doublet rate, interpreted as - a +/- around the value provided in --rnadbr. Set to 0 to - disable. Set to 1 to make the threshold depend entirely - on the misclassification rate. Default: 40 percents of the - value provided in --rnadbr - 'sd:layout': + minimum_novelty_score: + type: string? + default: "0.8" + label: "Minimum novelty score per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the novelty scores + smaller than the provided value. + This QC metric indicates the overall + transcriptomic dissimilarity of the + cells and is calculated as the ratio + of log10(Genes) to log10(RNA UMI).
+ If the selected "Cell Ranger RNA or + RNA+VDJ Sample" includes multiple + aggregated datasets, each of them + can be filtered independently by + providing comma or space-separated + list of filtering thresholds. The + order and number of the specified + values need to match with the datasets + order from the "aggregation_metadata.csv" + output generated by "Cell Ranger RNA or + RNA+VDJ Sample" and accessible on the + "Files" tab. + Default: 0.8 + "sd:layout": advanced: true color_theme: @@ -194,7 +245,7 @@ inputs: Color theme for all generated plots. One of gray, bw, linedraw, light, dark, minimal, classic, void. Default: classic - 'sd:layout': + "sd:layout": advanced: true threads: @@ -215,7 +266,7 @@ inputs: number of cores/CPUs that can be utilized simultaneously. Default: 1 - 'sd:layout': + "sd:layout": advanced: true @@ -224,365 +275,372 @@ outputs: raw_1_2_qc_mtrcs_pca_plot_png: type: File? outputSource: sc_rna_filter/raw_1_2_qc_mtrcs_pca_plot_png - label: "PC1 and PC2 from the QC metrics PCA (not filtered)" + label: "QC metrics PCA (1,2), raw" doc: | - PC1 and PC2 from the QC metrics PCA (not filtered). - PNG format - 'sd:visualPlugins': + PC1 and PC2 from the QC metrics + PCA for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'PC1 and PC2 from the QC metrics PCA' + tab: "Raw" + Caption: "QC metrics PCA (1,2)" raw_2_3_qc_mtrcs_pca_plot_png: type: File? outputSource: sc_rna_filter/raw_2_3_qc_mtrcs_pca_plot_png - label: "PC2 and PC3 from the QC metrics PCA (not filtered)" + label: "QC metrics PCA (2,3), raw" doc: | - PC2 and PC3 from the QC metrics PCA (not filtered). - PNG format - 'sd:visualPlugins': + PC2 and PC3 from the QC metrics + PCA for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'PC2 and PC3 from the QC metrics PCA' + tab: "Raw" + Caption: "QC metrics PCA (2,3)" raw_cells_count_plot_png: type: File?
outputSource: sc_rna_filter/raw_cells_count_plot_png - label: "Number of cells per dataset (not filtered)" + label: "Cells per dataset, raw" doc: | - Number of cells per dataset (not filtered). - PNG format - 'sd:visualPlugins': + Number of cells per dataset + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Number of cells per dataset' + tab: "Raw" + Caption: "Cells per dataset" raw_umi_dnst_plot_png: type: File? outputSource: sc_rna_filter/raw_umi_dnst_plot_png - label: "RNA reads per cell density (not filtered)" + label: "RNA reads per cell, raw" doc: | - RNA reads per cell density (not filtered). - PNG format - 'sd:visualPlugins': + RNA reads per cell density + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'RNA reads per cell density' + tab: "Raw" + Caption: "RNA reads per cell" raw_gene_dnst_plot_png: type: File? outputSource: sc_rna_filter/raw_gene_dnst_plot_png - label: "Genes per cell density (not filtered)" + label: "Genes per cell, raw" doc: | - Genes per cell density (not filtered). - PNG format - 'sd:visualPlugins': + Genes per cell density + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Genes per cell density' + tab: "Raw" + Caption: "Genes per cell" raw_gene_umi_plot_png: type: File? outputSource: sc_rna_filter/raw_gene_umi_plot_png - label: "Genes vs RNA reads per cell correlation (not filtered)" + label: "Genes vs RNA reads, raw" doc: | - Genes vs RNA reads per cell correlation (not filtered). - PNG format - 'sd:visualPlugins': + Genes vs RNA reads per cell + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Genes vs RNA reads per cell correlation' + tab: "Raw" + Caption: "Genes vs RNA reads" raw_mito_dnst_plot_png: type: File? 
outputSource: sc_rna_filter/raw_mito_dnst_plot_png - label: "Percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered)" + label: "Mitochondrial percentage, raw" doc: | - Percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered). - PNG format - 'sd:visualPlugins': + Percentage of RNA reads mapped to + mitochondrial genes per cell density + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Percentage of RNA reads mapped to mitochondrial genes per cell density' + tab: "Raw" + Caption: "Mitochondrial percentage" raw_nvlt_dnst_plot_png: type: File? outputSource: sc_rna_filter/raw_nvlt_dnst_plot_png - label: "Novelty score per cell density (not filtered)" + label: "Novelty score, raw" doc: | - Novelty score per cell density (not filtered). - PNG format - 'sd:visualPlugins': + Novelty score per cell density + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Novelty score per cell density' + tab: "Raw" + Caption: "Novelty score" raw_qc_mtrcs_dnst_plot_png: type: File? outputSource: sc_rna_filter/raw_qc_mtrcs_dnst_plot_png - label: "QC metrics per cell density (not filtered)" + label: "Main QC metrics, raw" doc: | - QC metrics per cell density (not filtered). - PNG format - 'sd:visualPlugins': + Main QC metrics per cell densities + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'QC metrics per cell density' + tab: "Raw" + Caption: "Main QC metrics" raw_rnadbl_plot_png: type: File? outputSource: sc_rna_filter/raw_rnadbl_plot_png - label: "Percentage of RNA doublets per dataset (not filtered)" + label: "RNA doublets, raw" doc: | - Percentage of RNA doublets per dataset (not filtered). 
- PNG format - 'sd:visualPlugins': + Percentage of RNA doublets per + dataset for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Percentage of RNA doublets per dataset' + tab: "Raw" + Caption: "RNA doublets" raw_umi_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/raw_umi_dnst_spl_cnd_plot_png - label: "Split by grouping condition RNA reads per cell density (not filtered)" + label: "RNA reads per cell, raw, split by condition" doc: | - Split by grouping condition RNA reads per cell density (not filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition RNA reads + per cell density for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Split by grouping condition RNA reads per cell density' + tab: "Raw, by condition" + Caption: "RNA reads per cell" raw_gene_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/raw_gene_dnst_spl_cnd_plot_png - label: "Split by grouping condition genes per cell density (not filtered)" + label: "Genes per cell, raw, split by condition" doc: | - Split by grouping condition genes per cell density (not filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition genes + per cell for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Split by grouping condition genes per cell density' + tab: "Raw, by condition" + Caption: "Genes per cell" raw_mito_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/raw_mito_dnst_spl_cnd_plot_png - label: "Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered)" + label: "Mitochondrial percentage, raw, split by condition" doc: | - Split by grouping condition the percentage of RNA reads mapped - to mitochondrial genes per cell density (not filtered). 
- PNG format - 'sd:visualPlugins': + Split by grouping condition the + percentage of RNA reads mapped to + mitochondrial genes per cell density + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density' + tab: "Raw, by condition" + Caption: "Mitochondrial percentage" raw_nvlt_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/raw_nvlt_dnst_spl_cnd_plot_png - label: "Split by grouping condition the novelty score per cell density (not filtered)" + label: "Novelty score, raw, split by condition" doc: | - Split by grouping condition the novelty score per cell density (not filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition the + novelty score per cell density + for raw data + "sd:visualPlugins": - image: - tab: 'Not filtered QC' - Caption: 'Split by grouping condition the novelty score per cell density' + tab: "Raw, by condition" + Caption: "Novelty score" fltr_1_2_qc_mtrcs_pca_plot_png: type: File? outputSource: sc_rna_filter/fltr_1_2_qc_mtrcs_pca_plot_png - label: "PC1 and PC2 from the QC metrics PCA (filtered)" + label: "QC metrics PCA (1,2), filtered" doc: | - PC1 and PC2 from the QC metrics PCA (filtered). - PNG format - 'sd:visualPlugins': + PC1 and PC2 from the QC metrics + PCA for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'PC1 and PC2 from the QC metrics PCA' + tab: "Filtered" + Caption: "QC metrics PCA (1,2)" fltr_2_3_qc_mtrcs_pca_plot_png: type: File? outputSource: sc_rna_filter/fltr_2_3_qc_mtrcs_pca_plot_png - label: "PC2 and PC3 from the QC metrics PCA (filtered)" + label: "QC metrics PCA (2,3), filtered" doc: | - PC2 and PC3 from the QC metrics PCA (filtered). 
- PNG format - 'sd:visualPlugins': + PC2 and PC3 from the QC metrics + PCA for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'PC2 and PC3 from the QC metrics PCA' + tab: "Filtered" + Caption: "QC metrics PCA (2,3)" fltr_cells_count_plot_png: type: File? outputSource: sc_rna_filter/fltr_cells_count_plot_png - label: "Number of cells per dataset (filtered)" + label: "Cells per dataset, filtered" doc: | - Number of cells per dataset (filtered). - PNG format - 'sd:visualPlugins': + Number of cells per dataset + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Number of cells per dataset' + tab: "Filtered" + Caption: "Cells per dataset" fltr_umi_dnst_plot_png: type: File? outputSource: sc_rna_filter/fltr_umi_dnst_plot_png - label: "RNA reads per cell density (filtered)" + label: "RNA reads per cell, filtered" doc: | - RNA reads per cell density (filtered). - PNG format - 'sd:visualPlugins': + RNA reads per cell density + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'RNA reads per cell density' + tab: "Filtered" + Caption: "RNA reads per cell" fltr_gene_dnst_plot_png: type: File? outputSource: sc_rna_filter/fltr_gene_dnst_plot_png - label: "Genes per cell density (filtered)" + label: "Genes per cell, filtered" doc: | - Genes per cell density (filtered). - PNG format - 'sd:visualPlugins': + Genes per cell density + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Genes per cell density' + tab: "Filtered" + Caption: "Genes per cell" fltr_gene_umi_plot_png: type: File? outputSource: sc_rna_filter/fltr_gene_umi_plot_png - label: "Genes vs RNA reads per cell correlation (filtered)" + label: "Genes vs RNA reads, filtered" doc: | - Genes vs RNA reads per cell correlation (filtered). 
- PNG format - 'sd:visualPlugins': + Genes vs RNA reads per cell + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Genes vs RNA reads per cell correlation' + tab: "Filtered" + Caption: "Genes vs RNA reads" fltr_mito_dnst_plot_png: type: File? outputSource: sc_rna_filter/fltr_mito_dnst_plot_png - label: "Percentage of RNA reads mapped to mitochondrial genes per cell density (filtered)" + label: "Mitochondrial percentage, filtered" doc: | - Percentage of RNA reads mapped to mitochondrial genes per cell density (filtered). - PNG format - 'sd:visualPlugins': + Percentage of RNA reads mapped to + mitochondrial genes per cell density + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Percentage of RNA reads mapped to mitochondrial genes per cell density' + tab: "Filtered" + Caption: "Mitochondrial percentage" fltr_nvlt_dnst_plot_png: type: File? outputSource: sc_rna_filter/fltr_nvlt_dnst_plot_png - label: "Novelty score per cell density (filtered)" + label: "Novelty score, filtered" doc: | - Novelty score per cell density (filtered). - PNG format - 'sd:visualPlugins': + Novelty score per cell density + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Novelty score per cell density' + tab: "Filtered" + Caption: "Novelty score" fltr_qc_mtrcs_dnst_plot_png: type: File? outputSource: sc_rna_filter/fltr_qc_mtrcs_dnst_plot_png - label: "QC metrics per cell density (filtered)" + label: "Main QC metrics, filtered" doc: | - QC metrics per cell density (filtered). - PNG format - 'sd:visualPlugins': + Main QC metrics per cell densities + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'QC metrics per cell density' + tab: "Filtered" + Caption: "Main QC metrics" fltr_rnadbl_plot_png: type: File? 
outputSource: sc_rna_filter/fltr_rnadbl_plot_png - label: "Percentage of RNA doublets per dataset (filtered)" + label: "RNA doublets, filtered" doc: | - Percentage of RNA doublets per dataset (filtered). - PNG format - 'sd:visualPlugins': + Percentage of RNA doublets per + dataset for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Percentage of RNA doublets per dataset' + tab: "Filtered" + Caption: "RNA doublets" fltr_umi_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/fltr_umi_dnst_spl_cnd_plot_png - label: "Split by grouping condition RNA reads per cell density (filtered)" + label: "RNA reads per cell, filtered, split by condition" doc: | - Split by grouping condition RNA reads per cell density (filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition RNA reads + per cell density for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Split by grouping condition RNA reads per cell density' + tab: "Filtered, by condition" + Caption: "RNA reads per cell" fltr_gene_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/fltr_gene_dnst_spl_cnd_plot_png - label: "Split by grouping condition genes per cell density (filtered)" + label: "Genes per cell, filtered, split by condition" doc: | - Split by grouping condition genes per cell density (filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition genes + per cell for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Split by grouping condition genes per cell density' + tab: "Filtered, by condition" + Caption: "Genes per cell" fltr_mito_dnst_spl_cnd_plot_png: type: File? 
outputSource: sc_rna_filter/fltr_mito_dnst_spl_cnd_plot_png - label: "Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density (filtered)" + label: "Mitochondrial percentage, filtered, split by condition" doc: | - Split by grouping condition the percentage of RNA reads mapped - to mitochondrial genes per cell density (filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition the + percentage of RNA reads mapped to + mitochondrial genes per cell density + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Split by grouping condition the percentage of RNA reads mapped to mitochondrial genes per cell density' + tab: "Filtered, by condition" + Caption: "Mitochondrial percentage" fltr_nvlt_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/fltr_nvlt_dnst_spl_cnd_plot_png - label: "Split by grouping condition the novelty score per cell density (filtered)" + label: "Novelty score, filtered, split by condition" doc: | - Split by grouping condition the novelty score per cell density (filtered). - PNG format - 'sd:visualPlugins': + Split by grouping condition the + novelty score per cell density + for filtered data + "sd:visualPlugins": - image: - tab: 'Filtered QC' - Caption: 'Split by grouping condition the novelty score per cell density' + tab: "Filtered, by condition" + Caption: "Novelty score" ucsc_cb_html_data: type: Directory outputSource: sc_rna_filter/ucsc_cb_html_data - label: "Directory with UCSC Cellbrowser html data" + label: "UCSC Cell Browser data" doc: | - Directory with UCSC Cellbrowser html data. + Directory with UCSC Cell Browser + data ucsc_cb_html_file: type: File outputSource: sc_rna_filter/ucsc_cb_html_file - label: "Open in UCSC Cell Browser" + label: "UCSC Cell Browser" doc: | - HTML index file from the directory with UCSC Cellbrowser html data. 
- 'sd:visualPlugins': + UCSC Cell Browser HTML index file + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" seurat_data_rds: type: File outputSource: sc_rna_filter/seurat_data_rds - label: "Processed Seurat data in RDS format" + label: "Processed seurat data in RDS format" doc: | - Processed Seurat data in RDS format + Processed seurat data in RDS format datasets_metadata: type: File @@ -603,16 +661,18 @@ outputs: sc_rna_filter_stdout_log: type: File outputSource: sc_rna_filter/stdout_log - label: "stdout log generated by sc_rna_filter step" + label: "Output log, filtering step" doc: | - stdout log generated by sc_rna_filter step + stdout log generated by + sc_rna_filter step sc_rna_filter_stderr_log: type: File outputSource: sc_rna_filter/stderr_log - label: "stderr log generated by sc_rna_filter step" + label: "Error log, filtering step" doc: | - stderr log generated by sc_rna_filter step + stderr log generated by + sc_rna_filter step steps: @@ -652,12 +712,6 @@ steps: mito_pattern: mito_pattern maximum_mito_perc: maximum_mito_perc remove_doublets: remove_doublets - rna_doublet_rate: - source: rna_doublet_rate - valueFrom: $(self==""?null:self) # safety measure - rna_doublet_rate_sd: - source: rna_doublet_rate_sd - valueFrom: $(self==""?null:self) # safety measure verbose: default: true export_ucsc_cb: @@ -736,7 +790,7 @@ steps: - stdout_log - stderr_log - pdf_plots: + folder_pdf_plots: run: ../tools/files-to-folder.cwl in: input_files: @@ -778,7 +832,7 @@ steps: compress_pdf_plots: run: ../tools/tar-compress.cwl in: - folder_to_compress: pdf_plots/folder + folder_to_compress: folder_pdf_plots/folder out: - compressed_folder diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 023a84be..8216bf8b 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -661,7 +661,7 @@ steps: - stdout_log - stderr_log - pdf_plots: + folder_pdf_plots: run: ../tools/files-to-folder.cwl in: input_files: 
@@ -692,7 +692,7 @@ steps: compress_pdf_plots: run: ../tools/tar-compress.cwl in: - folder_to_compress: pdf_plots/folder + folder_to_compress: folder_pdf_plots/folder out: - compressed_folder diff --git a/workflows/sc-rna-trajectory.cwl b/workflows/sc-rna-trajectory.cwl index 50f96b25..fef0f119 100644 --- a/workflows/sc-rna-trajectory.cwl +++ b/workflows/sc-rna-trajectory.cwl @@ -551,7 +551,7 @@ steps: - stdout_log - stderr_log - pdf_plots: + folder_pdf_plots: run: ../tools/files-to-folder.cwl in: input_files: @@ -587,7 +587,7 @@ steps: compress_pdf_plots: run: ../tools/tar-compress.cwl in: - folder_to_compress: pdf_plots/folder + folder_to_compress: folder_pdf_plots/folder out: - compressed_folder diff --git a/workflows/sc-triangulate.cwl b/workflows/sc-triangulate.cwl index 4febb1d8..bc62c44b 100644 --- a/workflows/sc-triangulate.cwl +++ b/workflows/sc-triangulate.cwl @@ -114,13 +114,13 @@ inputs: - "5" - "6" default: "1" - label: "Cores/CPUs" + label: "Number of cores/cpus to use" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. 
Default: 1 - "sd:layout": + 'sd:layout': advanced: true @@ -331,7 +331,7 @@ steps: - stdout_log - stderr_log - pdf_plots: + folder_pdf_plots: run: ../tools/files-to-folder.cwl in: input_files: @@ -354,7 +354,7 @@ steps: compress_pdf_plots: run: ../tools/tar-compress.cwl in: - folder_to_compress: pdf_plots/folder + folder_to_compress: folder_pdf_plots/folder out: - compressed_folder diff --git a/workflows/sc-vdj-profile.cwl b/workflows/sc-vdj-profile.cwl index 56cc802e..b256c8c1 100644 --- a/workflows/sc-vdj-profile.cwl +++ b/workflows/sc-vdj-profile.cwl @@ -41,13 +41,15 @@ inputs: contigs_data: type: File - label: "Cell Ranger Immune Profiling Sample" - doc: | - "Cell Ranger Multi Gene Expression and - V(D)J Repertoire Profiling" or "Cell - Ranger Aggregate" sample to load high - level annotations of each high-confidence - contig from the cell-associated barcodes + label: "Cell Ranger RNA+VDJ Sample" + doc: | + Any "Cell Ranger RNA+VDJ Sample" to + load high level annotations of each + high-confidence contig from the + cell-associated barcodes. This sample + can be analyzed with either "Cell + Ranger Count (RNA+VDJ)" or "Cell Ranger + Aggregate (RNA, RNA+VDJ)" pipeline. 'sd:upstreamSource': "sc_vdj_sample/filtered_contig_annotations_csv" 'sd:localLabel': true @@ -585,7 +587,7 @@ steps: - stdout_log - stderr_log - pdf_plots: + folder_pdf_plots: run: ../tools/files-to-folder.cwl in: input_files: @@ -620,7 +622,7 @@ steps: compress_pdf_plots: run: ../tools/tar-compress.cwl in: - folder_to_compress: pdf_plots/folder + folder_to_compress: folder_pdf_plots/folder out: - compressed_folder diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index 0355033c..c0cea858 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -59,11 +59,14 @@ inputs: type: File? 
secondaryFiles: - .tbi - label: "Cell Ranger ARC Sample (optional)" - doc: | - "Cell Ranger ARC Sample" for generating - ATAC fragments coverage plots over the genes - of interest. + label: "Cell Ranger RNA+ATAC Sample (optional)" + doc: | + Any "Cell Ranger ATAC or RNA+ATAC Sample" + for generating ATAC fragments coverage + plots over the genes of interest. This + sample can be analyzed with either + "Cell Ranger Count (RNA+ATAC)" or "Cell + Ranger Aggregate (RNA+ATAC)" pipeline 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" 'sd:localLabel': true @@ -149,8 +152,8 @@ inputs: Comma or space separated list of genes of interest to visualize expression and to generate ATAC fragments coverage plots. - Ignored if "Cell Ranger ARC Sample" input - is not provided. + Ignored if "Cell Ranger RNA+ATAC Sample" + input is not provided. Default: None color_theme: @@ -606,7 +609,7 @@ steps: - stdout_log - stderr_log - pdf_plots: + folder_pdf_plots: run: ../tools/files-to-folder.cwl in: input_files: @@ -636,7 +639,7 @@ steps: compress_pdf_plots: run: ../tools/tar-compress.cwl in: - folder_to_compress: pdf_plots/folder + folder_to_compress: folder_pdf_plots/folder out: - compressed_folder From 4bdd61ae2a4a312243a2b7af0842d5b135deb797 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 22 Jan 2024 19:17:49 -0500 Subject: [PATCH 107/162] Mistakenly included extra upstreams that are not needed --- workflows/diffbind-multi-factor.cwl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/workflows/diffbind-multi-factor.cwl b/workflows/diffbind-multi-factor.cwl index 19a1d65b..78b0e6ac 100644 --- a/workflows/diffbind-multi-factor.cwl +++ b/workflows/diffbind-multi-factor.cwl @@ -22,13 +22,8 @@ requirements: - "trim-chipseq-pe.cwl" - "trim-atacseq-se.cwl" - "trim-atacseq-pe.cwl" - - "https://github.com/datirium/workflows/workflows/trim-chipseq-se.cwl" - - "https://github.com/datirium/workflows/workflows/trim-chipseq-pe.cwl" - - 
"https://github.com/datirium/workflows/workflows/trim-atacseq-se.cwl" - - "https://github.com/datirium/workflows/workflows/trim-atacseq-pe.cwl" genome_indices: - "genome-indices.cwl" - - "https://github.com/datirium/workflows/workflows/genome-indices.cwl" inputs: From eff0ef555e6210c8c6122f0fcbce4d67a84ba4bf Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 22 Jan 2024 19:42:05 -0500 Subject: [PATCH 108/162] Minor correction in the inputs' docs --- workflows/sc-atac-cluster.cwl | 5 +++-- workflows/sc-atac-coverage.cwl | 14 +++++++++----- workflows/sc-atac-dbinding.cwl | 2 +- workflows/sc-ctype-assign.cwl | 4 ++-- workflows/sc-multiome-filter.cwl | 2 +- workflows/sc-rna-filter.cwl | 2 +- workflows/sc-vdj-profile.cwl | 2 +- workflows/sc-wnn-cluster.cwl | 6 +++--- 8 files changed, 21 insertions(+), 16 deletions(-) diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index 74836d81..c7e43413 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -63,7 +63,7 @@ inputs: Any "Cell Ranger ATAC or RNA+ATAC Sample" for generating ATAC fragments coverage plots over the genes of interest. This - sample can be analyzed with one of the + sample can be obtained from one of the following pipelines: "Cell Ranger Count (RNA+ATAC)", "Cell Ranger Aggregate (RNA+ATAC)", "Cell Ranger Count (ATAC)", @@ -123,7 +123,8 @@ inputs: Comma or space separated list of genes of interest to generate ATAC fragments coverage plots. Ignored if "Cell Ranger - ATAC or RNA+ATAC" input is not provided. + ATAC or RNA+ATAC (optional)" input is + not provided. 
Default: None color_theme: diff --git a/workflows/sc-atac-coverage.cwl b/workflows/sc-atac-coverage.cwl index c71350fe..5d2ba00c 100644 --- a/workflows/sc-atac-coverage.cwl +++ b/workflows/sc-atac-coverage.cwl @@ -31,7 +31,7 @@ requirements: - "cellranger-atac-aggr.cwl" genome_indices: - "genome-indices.cwl" - + - "https://github.com/datirium/workflows/workflows/genome-indices.cwl" inputs: @@ -55,11 +55,15 @@ inputs: type: File secondaryFiles: - .tbi - label: "Cell Ranger ATAC/ARC Count/Aggregate Experiment" + label: "Cell Ranger ATAC or RNA+ATAC Sample" doc: | - Count and barcode information for every ATAC fragment used in the - loaded Seurat object. File should be saved in TSV format and to be - tbi-indexed. + Any "Cell Ranger ATAC or RNA+ATAC Sample" + for generating ATAC fragments coverage + files. This sample can be obtained from + one of the following pipelines: "Cell + Ranger Count (RNA+ATAC)", "Cell Ranger + Aggregate (RNA+ATAC)", "Cell Ranger Count + (ATAC)", or "Cell Ranger Aggregate (ATAC)". 'sd:upstreamSource': "sc_atac_sample/atac_fragments_file" 'sd:localLabel': true diff --git a/workflows/sc-atac-dbinding.cwl b/workflows/sc-atac-dbinding.cwl index 7f880944..ab1782ba 100644 --- a/workflows/sc-atac-dbinding.cwl +++ b/workflows/sc-atac-dbinding.cwl @@ -67,7 +67,7 @@ inputs: doc: | Any "Cell Ranger ATAC or RNA+ATAC Sample" for loading chromatin accessibility data - from. This sample can be analyzed with + from. This sample can be obtained from one of the following pipelines: "Cell Ranger Count (RNA+ATAC)", "Cell Ranger Aggregate (RNA+ATAC)", "Cell Ranger Count diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 7eb3da55..88095fe1 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -69,7 +69,7 @@ inputs: Any "Cell Ranger ATAC or RNA+ATAC Sample" for generating ATAC fragments coverage plots over the genes of interest. 
This - sample can be analyzed with one of the + sample can be obtained from one of the following pipelines: "Cell Ranger Count (RNA+ATAC)", "Cell Ranger Aggregate (RNA+ATAC)", "Cell Ranger Count (ATAC)", @@ -163,7 +163,7 @@ inputs: of interest to visualize expression and to generate ATAC fragments coverage plots. Ignored if "Cell Ranger ATAC or RNA+ATAC - Sample" input is not provided. + Sample (optional)" input is not provided. Default: None cell_type_data: diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index e3669a6c..3bdb8f9c 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -40,7 +40,7 @@ inputs: format, ATAC fragments file in TSV format, and optional aggregation metadata file in TSV/CSV format. - This sample can be analyzed with + This sample can be obtained from either "Cell Ranger Count (RNA+ATAC)" or "Cell Ranger Aggregate (RNA+ATAC)" pipeline. diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index 18d6f0c0..05f278a6 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -40,7 +40,7 @@ inputs: matrix in a MEX format, optional annotated V(D)J contigs data, and optional aggregation metadata file in TSV/CSV format. This - sample can be analyzed with one of the + sample can be obtained from one of the following pipelines: "Cell Ranger Count (RNA)", "Cell Ranger Count (RNA+VDJ)", or "Cell Ranger Aggregate (RNA, RNA+VDJ)" diff --git a/workflows/sc-vdj-profile.cwl b/workflows/sc-vdj-profile.cwl index b256c8c1..efb4c97b 100644 --- a/workflows/sc-vdj-profile.cwl +++ b/workflows/sc-vdj-profile.cwl @@ -47,7 +47,7 @@ inputs: load high level annotations of each high-confidence contig from the cell-associated barcodes. This sample - can be analyzed with either "Cell + can be obtained from either "Cell Ranger Count (RNA+VDJ)" or "Cell Ranger Aggregate (RNA, RNA+VDJ)" pipeline. 
'sd:upstreamSource': "sc_vdj_sample/filtered_contig_annotations_csv" diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index c0cea858..88c37752 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -64,7 +64,7 @@ inputs: Any "Cell Ranger ATAC or RNA+ATAC Sample" for generating ATAC fragments coverage plots over the genes of interest. This - sample can be analyzed with either + sample can be obtained from either "Cell Ranger Count (RNA+ATAC)" or "Cell Ranger Aggregate (RNA+ATAC)" pipeline 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" @@ -152,8 +152,8 @@ inputs: Comma or space separated list of genes of interest to visualize expression and to generate ATAC fragments coverage plots. - Ignored if "Cell Ranger RNA+ATAC Sample" - input is not provided. + Ignored if "Cell Ranger RNA+ATAC Sample + (optional)" input is not provided. Default: None color_theme: From 3a90343d18a546df6532984d2a5daa27b8b49c26 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Sat, 27 Jan 2024 16:34:28 -0500 Subject: [PATCH 109/162] Refactoring cell ranger count pipeline Some changes in other cell ranger pipelines --- workflows/cellranger-arc-aggr.cwl | 11 +- workflows/cellranger-arc-count.cwl | 93 +++-- workflows/cellranger-atac-aggr.cwl | 64 ++-- workflows/cellranger-atac-count.cwl | 126 ++++--- workflows/cellranger-mkref.cwl | 53 ++- workflows/cellranger-multi.cwl | 70 ++-- .../single-cell-preprocess-cellranger.cwl | 355 +++++++++--------- 7 files changed, 430 insertions(+), 342 deletions(-) diff --git a/workflows/cellranger-arc-aggr.cwl b/workflows/cellranger-arc-aggr.cwl index c51e57a7..1ffe4a37 100644 --- a/workflows/cellranger-arc-aggr.cwl +++ b/workflows/cellranger-arc-aggr.cwl @@ -51,10 +51,15 @@ inputs: indices_folder: type: Directory - label: "Genome type" + label: "Cell Ranger Reference Sample" doc: | - Reference genome package created - with cellranger-arc mkref command. 
+ Any "Cell Ranger Reference Sample" that + builds a reference genome package of a + selected species for quantifying gene + expression and chromatin accessibility. + This sample can be obtained from "Cell + Ranger Reference (RNA, ATAC, RNA+ATAC)" + pipeline. 'sd:upstreamSource': "genome_indices/arc_indices_folder" 'sd:localLabel': true diff --git a/workflows/cellranger-arc-count.cwl b/workflows/cellranger-arc-count.cwl index f25d05ec..e15b3b9d 100644 --- a/workflows/cellranger-arc-count.cwl +++ b/workflows/cellranger-arc-count.cwl @@ -24,10 +24,15 @@ inputs: indices_folder: type: Directory - label: "Genome type" - doc: | - Reference genome package created - with cellranger-arc mkref command. + label: "Cell Ranger Reference Sample" + doc: | + Any "Cell Ranger Reference Sample" that + builds a reference genome package of a + selected species for quantifying gene + expression and chromatin accessibility. + This sample can be obtained from "Cell + Ranger Reference (RNA, ATAC, RNA+ATAC)" + pipeline. 
'sd:upstreamSource': "genome_indices/arc_indices_folder" 'sd:localLabel': true @@ -41,7 +46,6 @@ inputs: - File - type: array items: File - format: "http://edamontology.org/format_1930" label: "RNA FASTQ, Read 1" doc: | Optionally compressed FASTQ file @@ -55,7 +59,6 @@ inputs: - File - type: array items: File - format: "http://edamontology.org/format_1930" label: "RNA FASTQ, Read 2" doc: | Optionally compressed FASTQ file @@ -68,7 +71,6 @@ inputs: - File - type: array items: File - format: "http://edamontology.org/format_1930" label: "ATAC FASTQ, Read 1" doc: | Optionally compressed FASTQ file @@ -82,7 +84,6 @@ inputs: - File - type: array items: File - format: "http://edamontology.org/format_1930" label: "ATAC FASTQ, Read 2" doc: | Optionally compressed FASTQ file @@ -96,7 +97,6 @@ inputs: - File - type: array items: File - format: "http://edamontology.org/format_1930" label: "ATAC FASTQ, Read 3" doc: | Optionally compressed FASTQ file @@ -252,26 +252,42 @@ outputs: gex_possorted_genome_bam_bai: type: File outputSource: generate_counts_matrix/gex_possorted_genome_bam_bai - label: "RNA position-sorted alignments" - doc: | - Position-sorted and indexed BAM file - of RNA read alignments to the genome - and transcriptome. Each read in this - BAM file has a 10x Chromium cellular - (associated with a 10x Genomics gel - bead) barcode and molecular barcode - information attached. + label: "RNA reads" + doc: | + Genome track of RNA reads aligned to + the reference genome. Each read has + a 10x Chromium cellular (associated + with a 10x Genomics gel bead) barcode + and molecular barcode information + attached. 
+ "sd:visualPlugins": + - igvbrowser: + tab: "IGV Genome Browser" + id: "igvbrowser" + type: "alignment" + format: "bam" + name: "RNA reads" + displayMode: "SQUISHED" atac_possorted_genome_bam_bai: type: File outputSource: generate_counts_matrix/atac_possorted_genome_bam_bai - label: "ATAC position-sorted alignments" - doc: | - Position-sorted and indexed BAM file - for the Chromatin Accessibility - library. Chromium cellular barcode - and mapping information for each read - is stored as TAG fields. + label: "ATAC reads" + doc: | + Genome track of ATAC reads aligned to + the reference genome. Each read has + a 10x Chromium cellular (associated + with a 10x Genomics gel bead) barcode + and mapping information stored in TAG + fields. + "sd:visualPlugins": + - igvbrowser: + tab: "IGV Genome Browser" + id: "igvbrowser" + type: "alignment" + format: "bam" + name: "ATAC reads" + displayMode: "SQUISHED" filtered_feature_bc_matrix_folder: type: File @@ -373,24 +389,31 @@ outputs: outputSource: generate_counts_matrix/atac_peaks_bed_file label: "ATAC peaks" doc: | - Locations of open-chromatin regions - identified in this sample. These - regions are referred to as "peaks". + Genome track of open-chromatin + regions identified as peaks. + "sd:visualPlugins": + - igvbrowser: + tab: "IGV Genome Browser" + id: "igvbrowser" + type: "annotation" + name: "ATAC peaks" + displayMode: "COLLAPSE" + height: 40 atac_cut_sites_bigwig_file: type: File outputSource: generate_counts_matrix/atac_cut_sites_bigwig_file - label: "ATAC cut sites" + label: "ATAC transposition counts" doc: | Genome track of observed transposition sites in the experiment smoothed at a - resolution of 400 bases in bigWig format. - 'sd:visualPlugins': + resolution of 400 bases. 
+ "sd:visualPlugins": - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'wig' - name: "ATAC cut sites" + tab: "IGV Genome Browser" + id: "igvbrowser" + type: "wig" + name: "ATAC transposition counts" height: 120 atac_peak_annotation_file: diff --git a/workflows/cellranger-atac-aggr.cwl b/workflows/cellranger-atac-aggr.cwl index fbed12f1..6276d8c0 100644 --- a/workflows/cellranger-atac-aggr.cwl +++ b/workflows/cellranger-atac-aggr.cwl @@ -55,11 +55,23 @@ inputs: indices_folder: type: Directory - label: "Genome Type" - doc: "Cell Ranger ARC generated genome indices folder" + label: "Cell Ranger Reference Sample" + doc: | + Any "Cell Ranger Reference Sample" that + builds a reference genome package of a + selected species for quantifying gene + expression and chromatin accessibility. + This sample can be obtained from "Cell + Ranger Reference (RNA, ATAC, RNA+ATAC)" + pipeline. 'sd:upstreamSource': "genome_indices/arc_indices_folder" 'sd:localLabel': true + memory_limit: + type: int? + default: 20 + 'sd:upstreamSource': "genome_indices/memory_limit" + normalization_mode: type: - "null" @@ -72,18 +84,23 @@ inputs: advanced: true threads: - type: int? - default: 4 - label: "Number of threads" - doc: "Number of threads for those steps that support multithreading" - 'sd:layout': - advanced: true - - memory_limit: - type: int? - default: 20 - label: "Maximum memory used (GB)" - doc: "Maximum memory used (GB). The same will be applied to virtual memory" + type: + - "null" + - type: enum + symbols: + - "1" + - "2" + - "3" + - "4" + - "5" + - "6" + default: "4" + label: "Cores/CPUs" + doc: | + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. 
+ Default: 4 'sd:layout': advanced: true @@ -213,13 +230,6 @@ outputs: doc: | stderr log generated by cellranger-atac aggr - compressed_html_data_folder: - type: File - outputSource: compress_html_data_folder/compressed_folder - label: "Compressed folder with CellBrowser formatted results" - doc: | - Compressed folder with CellBrowser formatted results - html_data_folder: type: Directory outputSource: cellbrowser_build/html_data @@ -249,7 +259,9 @@ steps: gem_well_labels: gem_well_labels indices_folder: indices_folder normalization_mode: normalization_mode - threads: threads + threads: + source: threads + valueFrom: $(parseInt(self)) memory_limit: memory_limit virt_memory_limit: memory_limit out: @@ -301,14 +313,6 @@ steps: - html_data - index_html_file - compress_html_data_folder: - run: ../tools/tar-compress.cwl - in: - folder_to_compress: cellbrowser_build/html_data - out: - - compressed_folder - - $namespaces: s: http://schema.org/ diff --git a/workflows/cellranger-atac-count.cwl b/workflows/cellranger-atac-count.cwl index eb81ae82..b2e97921 100644 --- a/workflows/cellranger-atac-count.cwl +++ b/workflows/cellranger-atac-count.cwl @@ -24,11 +24,23 @@ inputs: indices_folder: type: Directory - label: "Genome Type" - doc: "Cell Ranger ARC generated genome indices folder" + label: "Cell Ranger Reference Sample" + doc: | + Any "Cell Ranger Reference Sample" that + builds a reference genome package of a + selected species for quantifying gene + expression and chromatin accessibility. + This sample can be obtained from "Cell + Ranger Reference (RNA, ATAC, RNA+ATAC)" + pipeline. 'sd:upstreamSource': "genome_indices/arc_indices_folder" 'sd:localLabel': true + memory_limit: + type: int? + default: 20 + 'sd:upstreamSource': "genome_indices/memory_limit" + fastq_file_r1: type: - File @@ -65,24 +77,26 @@ inputs: advanced: true threads: - type: int? 
- default: 4 - label: "Number of threads" - doc: "Number of threads for those steps that support multithreading" + type: + - "null" + - type: enum + symbols: + - "1" + - "2" + - "3" + - "4" + - "5" + - "6" + default: "4" + label: "Cores/CPUs" + doc: | + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 4 'sd:layout': advanced: true - memory_limit: - type: int? - default: 20 - label: "Genome Type" - doc: | - Maximum memory used (GB). - The same as was used for generating indices. - The same will be applied to virtual memory - 'sd:upstreamSource': "genome_indices/memory_limit" - 'sd:localLabel': true - outputs: @@ -154,10 +168,22 @@ outputs: possorted_genome_bam_bai: type: File outputSource: generate_counts_matrix/possorted_genome_bam_bai - label: "Aligned to the genome indexed reads BAM+BAI files" + label: "ATAC reads" doc: | - Indexed position-sorted reads aligned to the genome annotated - with barcode information in BAM format + Genome track of ATAC reads aligned to + the reference genome. Each read has + a 10x Chromium cellular (associated + with a 10x Genomics gel bead) barcode + and mapping information stored in TAG + fields. + "sd:visualPlugins": + - igvbrowser: + tab: "IGV Genome Browser" + id: "igvbrowser" + type: "alignment" + format: "bam" + name: "ATAC reads" + displayMode: "SQUISHED" atac_fragments_file: type: File @@ -170,10 +196,18 @@ outputs: peaks_bed_file: type: File outputSource: generate_counts_matrix/peaks_bed_file - label: "Identified peaks in BED format" + label: "ATAC peaks" doc: | - Locations of open-chromatin regions identified in the - experiment (these regions are referred to as "peaks") + Genome track of open-chromatin + regions identified as peaks. 
+ "sd:visualPlugins": + - igvbrowser: + tab: "IGV Genome Browser" + id: "igvbrowser" + type: "annotation" + name: "ATAC peaks" + displayMode: "COLLAPSE" + height: 40 peak_annotation_file: type: File @@ -185,15 +219,17 @@ outputs: cut_sites_bigwig_file: type: File outputSource: generate_counts_matrix/cut_sites_bigwig_file - label: "Smoothed transposition site track in bigWig format" + label: "ATAC transposition counts" doc: | - Smoothed transposition site track in bigWig format - 'sd:visualPlugins': + Genome track of observed transposition + sites in the experiment smoothed at a + resolution of 400 bases. + "sd:visualPlugins": - igvbrowser: - tab: 'IGV Genome Browser' - id: 'igvbrowser' - type: 'wig' - name: "ATAC cut sites" + tab: "IGV Genome Browser" + id: "igvbrowser" + type: "wig" + name: "ATAC transposition counts" height: 120 # peak_motif_mapping_bed: @@ -304,13 +340,6 @@ outputs: vertical: true tab: 'Overview' - compressed_html_data_folder: - type: File - outputSource: compress_html_data_folder/compressed_folder - label: "Compressed folder with CellBrowser formatted results" - doc: | - Compressed folder with CellBrowser formatted results - html_data_folder: type: Directory outputSource: cellbrowser_build/html_data @@ -363,7 +392,9 @@ steps: run: ../tools/fastqc.cwl in: reads_file: extract_fastq_r1/fastq_file - threads: threads + threads: + source: threads + valueFrom: $(parseInt(self)) out: - html_file @@ -371,7 +402,9 @@ steps: run: ../tools/fastqc.cwl in: reads_file: extract_fastq_r2/fastq_file - threads: threads + threads: + source: threads + valueFrom: $(parseInt(self)) out: - html_file @@ -379,7 +412,9 @@ steps: run: ../tools/fastqc.cwl in: reads_file: extract_fastq_r3/fastq_file - threads: threads + threads: + source: threads + valueFrom: $(parseInt(self)) out: - html_file @@ -391,7 +426,9 @@ steps: fastq_file_r3: extract_fastq_r3/fastq_file indices_folder: indices_folder force_cells: force_cells - threads: threads + threads: + source: threads + 
valueFrom: $(parseInt(self)) memory_limit: memory_limit virt_memory_limit: memory_limit out: @@ -462,13 +499,6 @@ steps: - html_data - index_html_file - compress_html_data_folder: - run: ../tools/tar-compress.cwl - in: - folder_to_compress: cellbrowser_build/html_data - out: - - compressed_folder - $namespaces: s: http://schema.org/ diff --git a/workflows/cellranger-mkref.cwl b/workflows/cellranger-mkref.cwl index 297f7eb3..73359ee8 100644 --- a/workflows/cellranger-mkref.cwl +++ b/workflows/cellranger-mkref.cwl @@ -18,39 +18,52 @@ inputs: alias: type: string - label: "Experiment short name/Alias" + label: "Analysis name" sd:preview: position: 1 genome_fasta_file: type: File - format: "http://edamontology.org/format_1929" label: "Genome type" - doc: "Reference genome FASTA file that includes all chromosomes" + doc: | + Genome type to be used for + generating reference genome + indices 'sd:upstreamSource': "genome_indices/fasta_output" 'sd:localLabel': true annotation_gtf_file: type: File - format: "http://edamontology.org/format_2306" - label: "Genome type" - doc: "GTF annotation file that includes refGene and mitochondrial DNA annotations" 'sd:upstreamSource': "genome_indices/annotation_gtf" - 'sd:localLabel': true - - threads: - type: int? - default: 4 - label: "Number of threads" - doc: "Number of threads for those steps that support multithreading" - 'sd:layout': - advanced: true memory_limit: type: int? default: 20 label: "Maximum memory used (GB)" - doc: "Maximum memory used (GB). The same will be applied to virtual memory" + doc: | + Maximum memory used (GB). The same + will be applied to virtual memory + 'sd:layout': + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + - "2" + - "3" + - "4" + - "5" + - "6" + default: "4" + label: "Cores/CPUs" + doc: | + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. 
+ Default: 4 'sd:layout': advanced: true @@ -114,7 +127,9 @@ steps: in: genome_fasta_file: genome_fasta_file annotation_gtf_file: annotation_gtf_file - threads: threads + threads: + source: threads + valueFrom: $(parseInt(self)) memory_limit: memory_limit output_folder_name: default: "cellranger_ref" @@ -180,7 +195,9 @@ steps: default: ["chrM"] # as recommended in Cell Ranger ARC manual output_folder_name: default: "cellranger_arc_ref" - threads: threads + threads: + source: threads + valueFrom: $(parseInt(self)) memory_limit: memory_limit out: - indices_folder diff --git a/workflows/cellranger-multi.cwl b/workflows/cellranger-multi.cwl index cf3c5b52..0ca5b3a3 100644 --- a/workflows/cellranger-multi.cwl +++ b/workflows/cellranger-multi.cwl @@ -26,19 +26,33 @@ inputs: gex_indices_folder: type: Directory - label: "Transcriptome reference" - doc: | - Path of folder containing 10x-compatible transcriptome reference. - Should be generated by "cellranger mkref" command + label: "Cell Ranger Reference Sample" + doc: | + Any "Cell Ranger Reference Sample" that + builds a reference genome package of a + selected species for quantifying gene + expression and chromatin accessibility. + This sample can be obtained from "Cell + Ranger Reference (RNA, ATAC, RNA+ATAC)" + pipeline. 'sd:upstreamSource': "gex_indices/indices_folder" 'sd:localLabel': true + memory_limit: + type: int? + default: 20 + "sd:upstreamSource": "gex_indices/memory_limit" + vdj_indices_folder: type: Directory - label: "V(D)J reference" - doc: | - Path of folder containing Cell Ranger V(D)J-compatible reference. - Should be generated by "cellranger mkvdjref" command + label: "Cell Ranger Reference VDJ Sample" + doc: | + Any "Cell Ranger Reference VDJ Sample" + that builds a reference genome of a + selected species for V(D)J contigs + assembly and clonotype calling. This + sample can be obtained from "Cell + Ranger Reference (VDJ)" pipeline. 
'sd:upstreamSource': "vdj_indices/indices_folder" 'sd:localLabel': true @@ -103,26 +117,22 @@ inputs: - "null" - type: enum symbols: + - "1" - "2" - "3" - "4" - default: "2" - label: "Number of cores/cpus to use" - doc: "Number of cores/cpus for those steps that support multithreading" - 'sd:layout': + - "5" + - "6" + default: "4" + label: "Cores/CPUs" + doc: | + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 4 + "sd:layout": advanced: true - memory_limit: - type: int? - default: 20 - label: "Transcriptome reference" - doc: | - Maximum memory used (GB). - The same as was used for generating indices. - The same will be applied to virtual memory - 'sd:upstreamSource': "gex_indices/memory_limit" - 'sd:localLabel': true - outputs: @@ -405,13 +415,6 @@ outputs: Folder containing filtered data, i.e., only cell-associated barcodes. Used by cellranger aggr to aggregate samples for joint analysis. - compressed_html_data_folder: - type: File - outputSource: compress_html_data_folder/compressed_folder - label: "Compressed folder with CellBrowser formatted results" - doc: | - Compressed folder with CellBrowser formatted results - html_data_folder: type: Directory outputSource: cellbrowser_build/html_data @@ -616,13 +619,6 @@ steps: - html_data - index_html_file - compress_html_data_folder: - run: ../tools/tar-compress.cwl - in: - folder_to_compress: cellbrowser_build/html_data - out: - - compressed_folder - $namespaces: s: http://schema.org/ diff --git a/workflows/single-cell-preprocess-cellranger.cwl b/workflows/single-cell-preprocess-cellranger.cwl index 5ba70504..de0af624 100644 --- a/workflows/single-cell-preprocess-cellranger.cwl +++ b/workflows/single-cell-preprocess-cellranger.cwl @@ -9,7 +9,7 @@ requirements: - class: MultipleInputFeatureRequirement -'sd:upstream': +"sd:upstream": genome_indices: - "cellranger-mkref.cwl" @@ -18,268 +18,287 @@ inputs: alias: type: string - label: "Experiment short 
name/Alias" + label: "Analysis name" sd:preview: position: 1 indices_folder: type: Directory - label: "Genome Type" - doc: "Cell Ranger generated genome indices folder" - 'sd:upstreamSource': "genome_indices/indices_folder" - 'sd:localLabel': true + label: "Cell Ranger Reference Sample" + doc: | + Any "Cell Ranger Reference Sample" that + builds a reference genome package of a + selected species for quantifying gene + expression and chromatin accessibility. + This sample can be obtained from "Cell + Ranger Reference (RNA, ATAC, RNA+ATAC)" + pipeline. + "sd:upstreamSource": "genome_indices/indices_folder" + "sd:localLabel": true + + memory_limit: + type: int? + default: 20 + "sd:upstreamSource": "genome_indices/memory_limit" fastq_file_r1: type: - File - type: array items: File - label: "FASTQ file(s) R1 (optionally compressed)" - doc: "FASTQ file(s) R1 (optionally compressed)" + label: "RNA FASTQ, Read 1" + doc: | + Optionally compressed FASTQ file + with Read 1 (10x barcode and UMI) + single-cell RNA sequencing data. + If multiple files provided they + will be merged. fastq_file_r2: type: - File - type: array items: File - label: "FASTQ file(s) R2 (optionally compressed)" - doc: "FASTQ file(s) R2 (optionally compressed)" - - r1_length: - type: int? - default: null - label: "Limit the length of the input R1 sequence" - doc: "Limit the length of the input R1 sequence" - 'sd:layout': - advanced: true - - r2_length: - type: int? - default: null - label: "Limit the length of the input R2 sequence" - doc: "Limit the length of the input R2 sequence" - 'sd:layout': - advanced: true - - expect_cells: - type: int? - default: null - label: "Expected number of recovered cells. If not provided - use auto-estimated" - doc: "Expected number of recovered cells. If not provided - use auto-estimated" - 'sd:layout': - advanced: true - - force_cells: - type: int? 
- default: null - label: "Force pipeline to use this number of cells, bypassing the cell detection algorithm" - doc: "Force pipeline to use this number of cells, bypassing the cell detection algorithm" - 'sd:layout': - advanced: true + label: "RNA FASTQ, Read 2" + doc: | + Optionally compressed FASTQ file + with Read 2 (cDNA insert) single-cell + RNA sequencing data. If multiple + files provided they will be merged. exclude_introns: type: boolean? default: false - label: "Do not count intronic reads for whole transcriptome gene expression data" - doc: "Do not count intronic reads for whole transcriptome gene expression data" - 'sd:layout': - advanced: true - - no_bam: - type: boolean? - default: true - label: "Do not generate the BAM file" - doc: "Do not generate the BAM file" - 'sd:layout': + label: "Do not count intronic reads" + doc: | + Exclude intronic reads when counting + gene expression. In this mode, only + reads that are exonic and compatible + with annotated splice junctions in + the reference are counted. Using this + mode will reduce the UMI counts and + decrease sensitivity. + "sd:layout": advanced: true threads: - type: int? - default: 4 - label: "Number of threads" - doc: "Number of threads for those steps that support multithreading" - 'sd:layout': - advanced: true - - memory_limit: - type: int? - default: 30 - label: "Genome Type" + type: + - "null" + - type: enum + symbols: + - "1" + - "2" + - "3" + - "4" + - "5" + - "6" + default: "4" + label: "Cores/CPUs" doc: | - Maximum memory used (GB). - The same as was used for generating indices. - The same will be applied to virtual memory - 'sd:upstreamSource': "genome_indices/memory_limit" - 'sd:localLabel': true + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. 
+ Default: 4 + "sd:layout": + advanced: true outputs: - fastqc_report_fastq_r1: + web_summary_report: type: File - outputSource: run_fastqc_for_fastq_r1/html_file - label: "FastqQC report for FASTQ file R1" + outputSource: generate_counts_matrix/web_summary_report + label: "Cell Ranger Summary" doc: | - FastqQC report for FASTQ file R1 - 'sd:visualPlugins': + Report generated by Cell Ranger + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" - fastqc_report_fastq_r2: + cellbrowser_report: type: File - outputSource: run_fastqc_for_fastq_r2/html_file - label: "FastqQC report for FASTQ file R2" + outputSource: cellbrowser_build/index_html_file + label: "UCSC Cell Browser" doc: | - FastqQC report for FASTQ file R2 - 'sd:visualPlugins': + UCSC Cell Browser HTML index file + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" - web_summary_report: + fastqc_report_fastq_r1: type: File - outputSource: generate_counts_matrix/web_summary_report - label: "Cell Ranger summary" + outputSource: run_fastqc_for_fastq_r1/html_file + label: "QC report (RNA FASTQ, Read 1)" doc: | - Cell Ranger summary - 'sd:visualPlugins': + FastqQC report generated for + RNA FASTQ file, Read 1 + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" + target: "_blank" + + fastqc_report_fastq_r2: + type: File + outputSource: run_fastqc_for_fastq_r2/html_file + label: "QC report (RNA FASTQ, Read 2)" + doc: | + FastqQC report generated for + RNA FASTQ file, Read 2 + "sd:visualPlugins": + - linkList: + tab: "Overview" target: "_blank" metrics_summary_report: type: File outputSource: generate_counts_matrix/metrics_summary_report - label: "Run summary metrics in CSV format" + label: "Run summary metrics" doc: | - Run summary metrics in CSV format + Cell Ranger generated run summary + metrics in CSV format possorted_genome_bam_bai: - type: File? 
+ type: File outputSource: generate_counts_matrix/possorted_genome_bam_bai - label: "Aligned to the genome indexed reads BAM+BAI files" + label: "RNA reads" doc: | - Indexed reads aligned to the genome and transcriptome annotated - with barcode information + Genome track of RNA reads aligned to + the reference genome. Each read has + a 10x Chromium cellular (associated + with a 10x Genomics gel bead) barcode + and molecular barcode information + attached. + "sd:visualPlugins": + - igvbrowser: + tab: "IGV Genome Browser" + id: "igvbrowser" + type: "alignment" + format: "bam" + name: "RNA reads" + displayMode: "SQUISHED" filtered_feature_bc_matrix_folder: type: File outputSource: compress_filtered_feature_bc_matrix_folder/compressed_folder - label: "Compressed folder with filtered feature-barcode matrices" + label: "Filtered feature barcode matrix, MEX" doc: | - Compressed folder with filtered feature-barcode matrices containing only cellular barcodes in MEX format. - When implemented, in Targeted Gene Expression samples, the non-targeted genes won't be present. + Filtered feature barcode matrix stored + as a CSC sparse matrix in MEX format. + The rows consist of the genes (identical + to raw feature barcode matrix) and the + columns are restricted to those barcodes + that are identified as cells. filtered_feature_bc_matrix_h5: type: File outputSource: generate_counts_matrix/filtered_feature_bc_matrix_h5 - label: "Filtered feature-barcode matrices in HDF5 format" + label: "Filtered feature barcode matrix, HDF5" doc: | - Filtered feature-barcode matrices containing only cellular barcodes in HDF5 format. - When implemented, in Targeted Gene Expression samples, the non-targeted genes won't - be present. + Filtered feature barcode matrix stored + as a CSC sparse matrix in hdf5 format. + The rows consist of the genes (identical + to raw feature barcode matrix) and the + columns are restricted to those barcodes + that are identified as cells. 
raw_feature_bc_matrices_folder: type: File outputSource: compress_raw_feature_bc_matrices_folder/compressed_folder - label: "Compressed folder with unfiltered feature-barcode matrices" + label: "Raw feature barcode matrix, MEX" doc: | - Compressed folder with unfiltered feature-barcode matrices containing all barcodes in MEX format + Raw feature barcode matrix stored as + a CSC sparse matrix in MEX format. + The rows consist of the genes and the + columns consist of all barcodes with + non-zero signal for gene expression. raw_feature_bc_matrices_h5: type: File outputSource: generate_counts_matrix/raw_feature_bc_matrices_h5 - label: "Unfiltered feature-barcode matrices in HDF5 format" + label: "Raw feature barcode matrix, HDF5" doc: | - Unfiltered feature-barcode matrices containing all barcodes in HDF5 format + Raw feature barcode matrix stored as + a CSC sparse matrix in hdf5 format. + The rows consist of the genes and the + columns consist of all barcodes with + non-zero signal for gene expression. secondary_analysis_report_folder: type: File outputSource: compress_secondary_analysis_report_folder/compressed_folder - label: "Compressed folder with secondary analysis results" + label: "Secondary analysis" doc: | - Compressed folder with secondary analysis results including dimensionality reduction, - cell clustering, and differential expression + Various secondary analyses results: + dimensionality reduction, clustering, + differential expression, etc. 
molecule_info_h5: type: File outputSource: generate_counts_matrix/molecule_info_h5 - label: "Molecule-level information for aggregating samples into larger datasets" + label: "RNA molecule-level data" doc: | - Molecule-level information used by cellranger aggr to aggregate samples into - larger datasets + Count and barcode information for + every RNA molecule observed in the + experiment in hdf5 format loupe_browser_track: + type: File outputSource: generate_counts_matrix/loupe_browser_track - label: "Loupe Browser visualization and analysis file" + label: "Loupe Browser visualization" + doc: | + Loupe Browser visualization file + with all the analysis outputs + + generate_counts_matrix_stdout_log: type: File + outputSource: generate_counts_matrix/stdout_log + label: "Output log, cellranger count step" doc: | - Loupe Browser visualization and analysis file + stdout log generated by cellranger count + + generate_counts_matrix_stderr_log: + type: File + outputSource: generate_counts_matrix/stderr_log + label: "Error log, cellranger count step" + doc: | + stderr log generated by cellranger count collected_statistics_yaml: type: File outputSource: collect_statistics/collected_statistics_yaml - label: "Collected statistics in YAML format" - doc: "Collected statistics in YAML format" + label: "Collected statistics, YAML" + doc: | + Collected statistics in YAML format collected_statistics_md: type: File outputSource: collect_statistics/collected_statistics_md - label: "Collected statistics in Markdown format" - doc: "Collected statistics in Markdown format" - 'sd:visualPlugins': + label: "Collected statistics" + doc: | + Collected statistics in Markdown format + "sd:visualPlugins": - markdownView: - tab: 'Overview' + tab: "Overview" collected_statistics_tsv: type: File outputSource: collect_statistics/collected_statistics_tsv - label: "Collected statistics in TSV format" - doc: "Collected statistics in TSV format" - 'sd:visualPlugins': + label: "Collected statistics" + 
doc: | + Collected statistics in TSV format + "sd:visualPlugins": - tableView: vertical: true - tab: 'Overview' - - generate_counts_matrix_stdout_log: - type: File - outputSource: generate_counts_matrix/stdout_log - label: stdout log generated by cellranger count - doc: | - stdout log generated by cellranger count - - generate_counts_matrix_stderr_log: - type: File - outputSource: generate_counts_matrix/stderr_log - label: stderr log generated by cellranger count - doc: | - stderr log generated by cellranger count - - compressed_html_data_folder: - type: File - outputSource: compress_html_data_folder/compressed_folder - label: "Compressed folder with CellBrowser formatted results" - doc: | - Compressed folder with CellBrowser formatted results + tab: "Overview" html_data_folder: type: Directory outputSource: cellbrowser_build/html_data - label: "Folder with not compressed CellBrowser formatted results" - doc: | - Folder with not compressed CellBrowser formatted results - - cellbrowser_report: - type: File - outputSource: cellbrowser_build/index_html_file - label: "CellBrowser formatted Cellranger report" + label: "UCSC Cell Browser data" doc: | - CellBrowser formatted Cellranger report - 'sd:visualPlugins': - - linkList: - tab: 'Overview' - target: "_blank" + Directory with UCSC Cell Browser data steps: @@ -306,7 +325,9 @@ steps: run: ../tools/fastqc.cwl in: reads_file: extract_fastq_r1/fastq_file - threads: threads + threads: + source: threads + valueFrom: $(parseInt(self)) out: - html_file @@ -314,7 +335,9 @@ steps: run: ../tools/fastqc.cwl in: reads_file: extract_fastq_r2/fastq_file - threads: threads + threads: + source: threads + valueFrom: $(parseInt(self)) out: - html_file @@ -324,13 +347,10 @@ steps: fastq_file_r1: extract_fastq_r1/fastq_file fastq_file_r2: extract_fastq_r2/fastq_file indices_folder: indices_folder - r1_length: r1_length - r2_length: r2_length - expect_cells: expect_cells - force_cells: force_cells - no_bam: no_bam exclude_introns: 
exclude_introns - threads: threads + threads: + source: threads + valueFrom: $(parseInt(self)) memory_limit: memory_limit virt_memory_limit: memory_limit out: @@ -386,13 +406,6 @@ steps: - html_data - index_html_file - compress_html_data_folder: - run: ../tools/tar-compress.cwl - in: - folder_to_compress: cellbrowser_build/html_data - out: - - compressed_folder - $namespaces: s: http://schema.org/ From fa6bbe73ac37d7150eca0cc7f304b8ad0c1d1fb6 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Sat, 27 Jan 2024 17:21:19 -0500 Subject: [PATCH 110/162] Put back some of the deprecated workflows that are not related to sc --- workflows/chipseq-pe.cwl | 841 +++++++++++++++++++++ workflows/chipseq-se.cwl | 717 ++++++++++++++++++ workflows/rnaseq-pe-dutp-mitochondrial.cwl | 634 ++++++++++++++++ workflows/rnaseq-pe-dutp.cwl | 591 +++++++++++++++ workflows/rnaseq-pe.cwl | 544 +++++++++++++ workflows/rnaseq-se-dutp-mitochondrial.cwl | 574 ++++++++++++++ workflows/rnaseq-se-dutp.cwl | 527 +++++++++++++ workflows/rnaseq-se.cwl | 480 ++++++++++++ 8 files changed, 4908 insertions(+) create mode 100644 workflows/chipseq-pe.cwl create mode 100644 workflows/chipseq-se.cwl create mode 100644 workflows/rnaseq-pe-dutp-mitochondrial.cwl create mode 100644 workflows/rnaseq-pe-dutp.cwl create mode 100644 workflows/rnaseq-pe.cwl create mode 100644 workflows/rnaseq-se-dutp-mitochondrial.cwl create mode 100644 workflows/rnaseq-se-dutp.cwl create mode 100644 workflows/rnaseq-se.cwl diff --git a/workflows/chipseq-pe.cwl b/workflows/chipseq-pe.cwl new file mode 100644 index 00000000..2aa5568f --- /dev/null +++ b/workflows/chipseq-pe.cwl @@ -0,0 +1,841 @@ +cwlVersion: v1.0 +class: Workflow + +requirements: +- class: SubworkflowFeatureRequirement +- class: ScatterFeatureRequirement +- class: StepInputExpressionRequirement +- class: MultipleInputFeatureRequirement +- class: InlineJavascriptRequirement + expressionLib: + - var get_root = function(basename) { + return 
basename.split('.').slice(0,1).join('.'); + }; + +'sd:metadata': + - "../metadata/chipseq-header.cwl" + +'sd:upstream': + genome_indices: "genome-indices.cwl" + control_file: "chipseq-pe.cwl" + + +inputs: + + indices_folder: + type: Directory + 'sd:upstreamSource': "genome_indices/bowtie_indices" + label: "Indexed genome folder (bowtie)" + doc: "Path to indexed genome folder by **bowtie**" + + annotation_file: + type: File + 'sd:upstreamSource': "genome_indices/annotation" + label: "Annotation file" + format: "http://edamontology.org/format_3475" + doc: "Tab-separated annotation file" + + genome_size: + type: string + 'sd:upstreamSource': "genome_indices/genome_size" + label: "Effective genome size" + doc: "MACS2 effective genome size: hs, mm, ce, dm or number, for example 2.7e9" + + chrom_length: + type: File + 'sd:upstreamSource': "genome_indices/chrom_length" + label: "Chromosomes length file" + format: "http://edamontology.org/format_2330" + doc: "Chromosomes length file" + + control_file: + type: File? + default: null + 'sd:upstreamSource': "control_file/bambai_pair" + 'sd:localLabel': true + label: "Use experiment as a control" + format: "http://edamontology.org/format_2572" + doc: "Use experiment as a control for MACS2 peak calling" + + broad_peak: + type: boolean? + default: False + label: "Callpeak broad" + doc: "Set to call broad peak for MACS2" + + fastq_file_upstream: + type: + - File + - type: array + items: File + label: "FASTQ 1 input file(s)" + format: "http://edamontology.org/format_1930" + doc: "Reads data in a FASTQ format, received after paired end sequencing" + + fastq_file_downstream: + type: + - File + - type: array + items: File + label: "FASTQ 2 input file(s)" + format: "http://edamontology.org/format_1930" + doc: "Reads data in a FASTQ format, received after paired end sequencing" + + exp_fragment_size: + type: int? 
+ default: 150 + 'sd:layout': + advanced: true + label: "Expected fragment size" + doc: "Expected fragment size for MACS2" + + force_fragment_size: + type: boolean? + default: false + 'sd:layout': + advanced: true + label: "Force fragment size" + doc: "Force MACS2 to use exp_fragment_size" + + clip_3p_end: + type: int? + default: 0 + 'sd:layout': + advanced: true + label: "Clip from 3p end" + doc: "Number of bases to clip from the 3p end" + + clip_5p_end: + type: int? + default: 0 + 'sd:layout': + advanced: true + label: "Clip from 5p end" + doc: "Number of bases to clip from the 5p end" + + remove_duplicates: + type: boolean? + default: false + 'sd:layout': + advanced: true + label: "Remove duplicates" + doc: "Calls samtools rmdup to remove duplicates from sorted BAM file" + + peak_calling_fdr: + type: float? + default: 0.05 + 'sd:layout': + advanced: true + label: "Minimum FDR (q-value) cutoff for peak detection" + doc: | + Minimum FDR (q-value) cutoff for peak detection. -q, and + -p are mutually exclusive. + + promoter_dist: + type: int? + default: 1000 + 'sd:layout': + advanced: true + label: "Max distance from gene TSS (in both direction) overlapping which the peak will be assigned to the promoter region" + doc: "Max distance from gene TSS (in both direction) overlapping which the peak will be assigned to the promoter region" + + upstream_dist: + type: int? + default: 20000 + 'sd:layout': + advanced: true + label: "Max distance from the promoter (only in upstream direction) overlapping which the peak will be assigned to the upstream region" + doc: "Max distance from the promoter (only in upstream direction) overlapping which the peak will be assigned to the upstream region" + + threads: + type: int? 
+ default: 2 + 'sd:layout': + advanced: true + doc: "Number of threads for those steps that support multithreading" + label: "Number of threads" + +outputs: + + unaligned_fastq: + type: + - "null" + - File[] + format: "http://edamontology.org/format_1930" + label: "Unaligned FASTQ file(s)" + doc: "Unaligned FASTQ file(s)" + outputSource: bowtie_aligner/unaligned_fastq + + multimapped_fastq: + type: + - "null" + - File[] + format: "http://edamontology.org/format_1930" + label: "Multimapped FASTQ file(s)" + doc: "Multimapped FASTQ file(s)" + outputSource: bowtie_aligner/multimapped_fastq + + bigwig: + type: File + format: "http://edamontology.org/format_3006" + label: "BigWig file" + doc: "Generated BigWig file" + outputSource: bam_to_bigwig/bigwig_file + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "BigWig Track" + height: 120 + + fastx_statistics_upstream: + type: File + label: "FASTQ 1 statistics" + format: "http://edamontology.org/format_2330" + doc: "fastx_quality_stats generated FASTQ 1 quality statistics file" + outputSource: fastx_quality_stats_upstream/statistics_file + 'sd:visualPlugins': + - line: + tab: 'QC Plots' + Title: 'FASTQ 1 Base frequency plot' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Frequency' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$13, $14, $15, $16, $17] + - boxplot: + tab: 'QC Plots' + Title: 'FASTQ 1 Quality Control' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Quality score' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$11, $7, $8, $9, $12] + + fastx_statistics_downstream: + type: File + label: "FASTQ 2 statistics" + format: "http://edamontology.org/format_2330" + doc: "fastx_quality_stats generated FASTQ 2 quality statistics file" + outputSource: fastx_quality_stats_downstream/statistics_file + 'sd:visualPlugins': + - line: + tab: 'QC Plots' + Title: 'FASTQ 2 Base frequency plot' + xAxisTitle: 
'Nucleotide position' + yAxisTitle: 'Frequency' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$13, $14, $15, $16, $17] + - boxplot: + tab: 'QC Plots' + Title: 'FASTQ 2 Quality Control' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Quality score' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$11, $7, $8, $9, $12] + + bowtie_log: + type: File + label: "BOWTIE alignment log" + format: "http://edamontology.org/format_2330" + doc: "BOWTIE generated alignment log" + outputSource: bowtie_aligner/log_file + + iaintersect_log: + type: File + label: "Island intersect log" + format: "http://edamontology.org/format_3475" + doc: "Iaintersect generated log" + outputSource: island_intersect/log_file + + iaintersect_result: + type: File + label: "Island intersect results" + format: "http://edamontology.org/format_3475" + doc: "Iaintersect generated results" + outputSource: island_intersect/result_file + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'Peak Calling' + Title: 'Islands list' + + atdp_log: + type: File + label: "ATDP log" + format: "http://edamontology.org/format_3475" + doc: "Average Tag Density generated log" + outputSource: average_tag_density/log_file + + atdp_result: + type: File + label: "ATDP results" + format: "http://edamontology.org/format_3475" + doc: "Average Tag Density generated results" + outputSource: average_tag_density/result_file + 'sd:visualPlugins': + - scatter: + tab: 'QC Plots' + Title: 'Average Tag Density' + xAxisTitle: 'Distance From TSS (bases)' + yAxisTitle: 'Average Tag Density (per bp)' + colors: ["#b3de69"] + height: 500 + data: [$1, $2] + comparable: "atdp" + + bambai_pair: + type: File + format: "http://edamontology.org/format_2572" + label: "Coordinate sorted BAM alignment file (+index BAI)" + doc: "Coordinate sorted BAM file and BAI index file" + outputSource: samtools_remove_duplicates/deduplicated_bam_bai_pair + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome 
Browser' + id: 'igvbrowser' + optional: true + type: 'alignment' + format: 'bam' + name: "BAM Track" + displayMode: "SQUISHED" + + macs2_called_peaks: + type: File? + label: "Called peaks" + format: "http://edamontology.org/format_3468" + doc: "XLS file to include information about called peaks" + outputSource: macs2_callpeak/peak_xls_file + + macs2_narrow_peaks: + type: File? + label: "Narrow peaks" + format: "http://edamontology.org/format_3613" + doc: "Contains the peak locations together with peak summit, pvalue and qvalue" + outputSource: macs2_callpeak/narrow_peak_file + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + type: 'annotation' + name: "Narrow peaks" + displayMode: "COLLAPSE" + height: 40 + + macs2_broad_peaks: + type: File? + label: "Broad peaks" + format: "http://edamontology.org/format_3614" + doc: "Contains the peak locations together with peak summit, pvalue and qvalue" + outputSource: macs2_callpeak/broad_peak_file + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + type: 'annotation' + name: "Broad peaks" + displayMode: "COLLAPSE" + height: 40 + + macs2_peak_summits: + type: File? + label: "Peak summits" + format: "http://edamontology.org/format_3003" + doc: "Contains the peak summits locations for every peaks" + outputSource: macs2_callpeak/peak_summits_file + + macs2_moder_r: + type: File? + label: "MACS2 generated R script" + format: "http://edamontology.org/format_2330" + doc: "R script to produce a PDF image about the model based on your data" + outputSource: macs2_callpeak/moder_r_file + + macs2_gapped_peak: + type: File? 
+ label: "Gapped peaks" + format: "http://edamontology.org/format_3586" + doc: "Contains both the broad region and narrow peaks" + outputSource: macs2_callpeak/gapped_peak_file + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + type: 'annotation' + name: "Gapped peaks" + displayMode: "COLLAPSE" + height: 40 + + macs2_log: + type: File? + label: "MACS2 log" + format: "http://edamontology.org/format_2330" + doc: "MACS2 output log" + outputSource: macs2_callpeak/macs_log + + get_stat_log: + type: File? + label: "YAML formatted combined log" + format: "http://edamontology.org/format_3750" + doc: "YAML formatted combined log" + outputSource: get_stat/collected_statistics_yaml + + get_stat_markdown: + type: File? + label: "Markdown formatted combined log" + format: "http://edamontology.org/format_3835" + doc: "Markdown formatted combined log" + outputSource: get_stat/collected_statistics_md + 'sd:visualPlugins': + - markdownView: + tab: 'Overview' + + get_stat_formatted_log: + type: File? 
+ label: "Bowtie & Samtools Rmdup combined formatted log" + format: "http://edamontology.org/format_3475" + doc: "Processed and combined Bowtie aligner and Samtools rmdup formatted log" + outputSource: get_stat/collected_statistics_tsv + 'sd:visualPlugins': + - tableView: + vertical: true + tab: 'Overview' + 'sd:preview': + 'sd:visualPlugins': + - pie: + colors: ['#b3de69', '#99c0db', '#fb8072', '#fdc381'] + data: [$2, $3, $4, $5] + + bam_statistics_report: + type: File + label: "BAM statistics report (original)" + format: "http://edamontology.org/format_2330" + doc: "BAM statistics report (right after alignment and sorting)" + outputSource: get_bam_statistics/log_file + + bam_statistics_report_after_filtering: + type: File + label: "BAM statistics report (after filtering)" + format: "http://edamontology.org/format_2330" + doc: "BAM statistics report (after all filters applied)" + outputSource: get_bam_statistics_after_filtering/log_file + + insert_size_report_after_filtering: + type: File + label: "Insert size distribution report (after filtering)" + format: "http://edamontology.org/format_3475" + doc: "Insert size distribution report (after all filters applied)" + outputSource: get_bam_statistics_after_filtering/ext_is_section + 'sd:visualPlugins': + - scatter: + tab: 'QC Plots' + Title: 'Insert Size Distribution (after filtering)' + xAxisTitle: 'Insert size' + yAxisTitle: 'Pairs total' + colors: ["#4b78a3"] + height: 500 + data: [$1, $2] + comparable: "isdp" + + macs2_fragment_stat: + type: File? + label: "FRAGMENT, FRAGMENTE, ISLANDS" + format: "http://edamontology.org/format_2330" + doc: "fragment, calculated fragment, islands count from MACS2 results" + outputSource: macs2_callpeak/macs2_stat_file + + preseq_estimates_plot_data: + type: File? 
+ label: "Preseq estimates" + format: "http://edamontology.org/format_3475" + doc: "Preseq estimated results" + outputSource: preseq_plot_data/estimates_file_plot_data + 'sd:visualPlugins': + - line: + tab: 'QC Plots' + Title: 'Distinct Read Counts Estimates' + xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' + yAxisTitle: 'Distinct Reads Count' + colors: ["#4b78a3", "#a3514b"] + height: 500 + data: [$2, $5] + + estimated_fragment_size: + type: int + label: "Estimated fragment size" + doc: "Estimated fragment size for downstream analyses" + outputSource: macs2_callpeak/macs2_fragments_calculated + + mapped_reads_number: + type: int + label: "Mapped reads number" + doc: "Mapped reads number for downstream analyses" + outputSource: get_stat/mapped_reads + + +steps: + + extract_fastq_upstream: + label: "Loading unmapped sequence data for read 1" + doc: | + Most DNA cores and commercial NGS companies return unmapped sequence data in FASTQ format. + The data can be uploaded from users computer, downloaded directly from an ftp server of + the core facility by providing a URL or from GEO by providing SRA accession number. + run: ../tools/extract-fastq.cwl + in: + compressed_file: fastq_file_upstream + output_prefix: + default: "read_1" + out: [fastq_file] + + extract_fastq_downstream: + label: "Loading unmapped sequence data for read 2" + doc: | + Most DNA cores and commercial NGS companies return unmapped sequence data in FASTQ format. + The data can be uploaded from users computer, downloaded directly from an ftp server of + the core facility by providing a URL or from GEO by providing SRA accession number. + run: ../tools/extract-fastq.cwl + in: + compressed_file: fastq_file_downstream + output_prefix: + default: "read_2" + out: [fastq_file] + + fastx_quality_stats_upstream: + label: "Quality control of unmapped sequence data for read 1" + doc: | + Evaluates the quality of your sequence data. 
Provides per base quality scores as well as + base frequencies along the reads. These metrics can be used to identify whether your data + has any problems that should be taken into account in the subsequent analysis steps. + run: ../tools/fastx-quality-stats.cwl + in: + input_file: extract_fastq_upstream/fastq_file + out: [statistics_file] + + fastx_quality_stats_downstream: + label: "Quality control of unmapped sequence data for read 2" + doc: | + Evaluates the quality of your sequence data. Provides per base quality scores as well as + base frequencies along the reads. These metrics can be used to identify whether your data + has any problems that should be taken into account in the subsequent analysis steps. + run: ../tools/fastx-quality-stats.cwl + in: + input_file: extract_fastq_downstream/fastq_file + out: [statistics_file] + + bowtie_aligner: + label: "Alignment to reference genome" + doc: | + Aligns reads to the reference genome. + Reads are assumed to be mapped if they + have less than 3 mismatches. + sam_file output includes both mapped + and unmapped reads. 
+ run: ../tools/bowtie-alignreads.cwl + in: + upstream_filelist: extract_fastq_upstream/fastq_file + downstream_filelist: extract_fastq_downstream/fastq_file + indices_folder: indices_folder + clip_3p_end: clip_3p_end + clip_5p_end: clip_5p_end + v: + default: 3 + m: + default: 1 + best: + default: true + strata: + default: true + sam: + default: true + unaligned_prefix: + default: "unaligned_reads" + multimapped_prefix: + default: "multimapped_reads" + threads: threads + q: + default: true + X: + default: 500 + out: [sam_file, log_file, unaligned_fastq, multimapped_fastq] + + samtools_sort_index: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: bowtie_aligner/sam_file + threads: threads + out: [bam_bai_pair] + + samtools_mark_duplicates: + run: ../tools/samtools-markdup.cwl + in: + bam_bai_pair: samtools_sort_index/bam_bai_pair + keep_duplicates: + default: true + threads: threads + out: [deduplicated_bam_bai_pair] + + clean_sam_headers_for_preseq: + run: ../tools/samtools-clean-headers.cwl + in: + bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair + out: [preseq_bam] + + preseq: + label: "Sequencing depth estimation" + doc: | + Estimates the complexity of the sequencing library, evaluates how many reads can + be expected from the additional sequencing of the same experiment. + run: ../tools/preseq-lc-extrap.cwl + in: + bam_file: clean_sam_headers_for_preseq/preseq_bam + pe_mode: + default: true + extrapolation: + default: 1000000000 + out: [estimates_file, log_file_stdout, log_file_stderr] + + samtools_remove_duplicates: + run: ../tools/samtools-markdup.cwl + in: + bam_bai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair + keep_duplicates: + source: remove_duplicates + valueFrom: $(!self) + threads: threads + out: [deduplicated_bam_bai_pair] + + macs2_callpeak: + label: "Peak detection" + doc: | + Identifies enriched with aligned reads genome areas. Those areas correspond to the + transcription factor binding sites. 
+ run: ../tools/macs2-callpeak-biowardrobe-only.cwl + in: + treatment_file: samtools_remove_duplicates/deduplicated_bam_bai_pair + control_file: control_file + nolambda: + source: control_file + valueFrom: $(!self) + genome_size: genome_size + mfold: + default: "4 40" + verbose: + default: 3 + nomodel: force_fragment_size + extsize: exp_fragment_size + bw: exp_fragment_size + broad: broad_peak + call_summits: + source: broad_peak + valueFrom: $(!self) + keep_dup: + default: auto + q_value: peak_calling_fdr + format_mode: + default: BAMPE + buffer_size: + default: 10000 + out: + - peak_xls_file + - narrow_peak_file + - peak_summits_file + - broad_peak_file + - moder_r_file + - gapped_peak_file + - treat_pileup_bdg_file + - control_lambda_bdg_file + - macs_log + - macs2_stat_file + - macs2_fragments_calculated + + bam_to_bigwig: + run: ../tools/bam-bedgraph-bigwig.cwl + in: + bam_file: samtools_remove_duplicates/deduplicated_bam_bai_pair + chrom_length_file: chrom_length + mapped_reads_number: get_stat/mapped_reads + pairchip: + default: true + out: [bigwig_file] + + get_bam_statistics: + label: "Quality control of aligned sequence data" + doc: | + Calculates alignment statistics, such as reads mapped/unmapped, average + read length and quality score, etc. 
+ run: ../tools/samtools-stats.cwl + in: + bambai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair + output_filename: + source: samtools_mark_duplicates/deduplicated_bam_bai_pair + valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") + out: [log_file] + + get_bam_statistics_after_filtering: + run: ../tools/samtools-stats.cwl + in: + bambai_pair: samtools_remove_duplicates/deduplicated_bam_bai_pair + output_filename: + source: samtools_remove_duplicates/deduplicated_bam_bai_pair + valueFrom: $(get_root(self.basename)+"_bam_statistics_report_after_filtering.txt") + out: [log_file, ext_is_section, reads_mapped] + + get_stat: + run: ../tools/collect-statistics-chip-seq.cwl + in: + bowtie_alignment_report: bowtie_aligner/log_file + bam_statistics_report: get_bam_statistics/log_file + bam_statistics_after_filtering_report: get_bam_statistics_after_filtering/log_file + macs2_called_peaks: macs2_callpeak/peak_xls_file + atdp_results: average_tag_density/result_file + preseq_results: preseq/estimates_file + paired_end: + default: True + out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] + + preseq_plot_data: + label: "Formats sequencing depth estimation data for plotting" + doc: | + Formats estimates file from preseq standard output for QC plotting. This adds a new + column that includes the actual read count point on the plot. + run: ../tools/preseq-plot-data.cwl + in: + preseq_stderr_log_file: preseq/log_file_stderr + estimates_file: preseq/estimates_file + mapped_reads: get_stat/mapped_reads + out: [estimates_file_plot_data] + + island_intersect: + label: "Peak annotation" + doc: | + Assigns nearest genes to peaks to explore the biological implication of the open + chromatin binding sites. 
+ run: ../tools/iaintersect.cwl + in: + input_filename: macs2_callpeak/peak_xls_file + annotation_filename: annotation_file + promoter_bp: promoter_dist + upstream_bp: upstream_dist + out: [result_file, log_file] + + samtools_sort_index_for_atdp: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: samtools_remove_duplicates/deduplicated_bam_bai_pair + threads: threads + out: [bam_bai_pair] + + average_tag_density: + label: "Read enrichment around genes TSS" + doc: | + Generates average tag density plot around genes TSS as a lot of cis-regulatory + elements are close to the TSS of their targets. + run: ../tools/atdp.cwl + in: + input_file: samtools_sort_index_for_atdp/bam_bai_pair + annotation_filename: annotation_file + fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated + avd_window_bp: + default: 5000 + avd_smooth_bp: + default: 50 + ignore_chr: + default: chrM + double_chr: + default: "chrX chrY" + avd_heat_window_bp: + default: 200 + mapped_reads: + source: get_bam_statistics_after_filtering/reads_mapped + valueFrom: $(parseInt(self/2)) + out: [result_file, log_file] + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Deprecated. ChIP-Seq pipeline paired-end" +s:name: "Deprecated. 
ChIP-Seq pipeline paired-end" +s:alternateName: "ChIP-Seq basic analysis workflow for a paired-end experiment" + +s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/chipseq-pe.cwl +s:codeRepository: https://github.com/datirium/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:michael.kotliar@cchmc.org + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + +# doc: +# $include: ../descriptions/chipseq-pe.md + + +doc: | + The original [BioWardrobe's](https://biowardrobe.com) [PubMed ID:26248465](https://www.ncbi.nlm.nih.gov/pubmed/26248465) + **ChIP-Seq** basic analysis workflow for a **paired-end** experiment. + A [FASTQ](http://maq.sourceforge.net/fastq.shtml) input file has to be provided. + + The pipeline produces a sorted BAM file alongside with index BAI file, quality + statistics of the input FASTQ file, coverage by estimated fragments as a BigWig file, peaks calling + data in a form of narrowPeak or broadPeak files, islands with the assigned nearest genes and + region type, data for average tag density plot. + + Workflow starts with step *fastx\_quality\_stats* from FASTX-Toolkit + to calculate quality statistics for input FASTQ file. 
+ + At the same time `bowtie` is used to align + reads from input FASTQ file to reference genome *bowtie\_aligner*. The output of this step + is an unsorted SAM file which is being sorted and indexed by `samtools sort` and `samtools index` + *samtools\_sort\_index*. + + Depending on workflow’s input parameters indexed and sorted BAM file + can be processed by `samtools markdup` *samtools\_remove\_duplicates* to get rid of duplicated reads. + + Next `macs2 callpeak` performs peak calling *macs2\_callpeak* and the next step + reports *macs2\_island\_count* the number of islands and estimated fragment size. If the latter + is less than 80bp (hardcoded in the workflow) `macs2 callpeak` is rerun again with forced fixed + fragment size value (*macs2\_callpeak\_forced*). It is also possible to force MACS2 to use preset fragment size in the first place. + + Next step (*macs2\_stat*) is used to define which of the islands and estimated fragment size should be used + in workflow output: either from *macs2\_island\_count* step or from *macs2\_island\_count\_forced* step. If input + trigger of this step is set to True it means that *macs2\_callpeak\_forced* step was run and it returned different + from *macs2\_callpeak* step results, so *macs2\_stat* step should return [fragments\_new, fragments\_old, islands\_new], + if trigger is False the step returns [fragments\_old, fragments\_old, islands\_old], where suffix "old" defines + results obtained from *macs2\_island\_count* step and suffix "new" - from *macs2\_island\_count\_forced* step. + + The following two steps (*bamtools\_stats* and *bam\_to\_bigwig*) are used to calculate coverage from BAM file and save it in BigWig format. For that purpose bamtools stats returns the number of + mapped reads which is then used as scaling factor by bedtools genomecov when it performs coverage + calculation and saves it as a BEDgraph file which is then sorted and converted to BigWig format by + bedGraphToBigWig tool from UCSC utilities. 
Step *get\_stat* is used to return a text file with statistics + in a form of [TOTAL, ALIGNED, SUPRESSED, USED] reads count. + + Step *island\_intersect* assigns nearest genes and regions to the islands obtained from *macs2\_callpeak\_forced*. + Step *average\_tag\_density* is used to calculate data for average tag density plot from the BAM file. \ No newline at end of file diff --git a/workflows/chipseq-se.cwl b/workflows/chipseq-se.cwl new file mode 100644 index 00000000..d6f83eb5 --- /dev/null +++ b/workflows/chipseq-se.cwl @@ -0,0 +1,717 @@ +cwlVersion: v1.0 +class: Workflow + +requirements: + - class: SubworkflowFeatureRequirement + - class: ScatterFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var get_root = function(basename) { + return basename.split('.').slice(0,1).join('.'); + }; + + +'sd:metadata': + - "../metadata/chipseq-header.cwl" + +'sd:upstream': + genome_indices: "genome-indices.cwl" + control_file: "chipseq-se.cwl" + + +inputs: + + indices_folder: + type: Directory + 'sd:upstreamSource': "genome_indices/bowtie_indices" + label: "Genome indices" + doc: "Directory with the genome indices generated by Bowtie" + + annotation_file: + type: File + 'sd:upstreamSource': "genome_indices/annotation" + label: "Genome annotation file" + format: "http://edamontology.org/format_3475" + doc: "Genome annotation file in TSV format" + + genome_size: + type: string + 'sd:upstreamSource': "genome_indices/genome_size" + label: "Effective genome size" + doc: "The length of the mappable genome (hs, mm, ce, dm or number, for example 2.7e9)" + + chrom_length: + type: File + 'sd:upstreamSource': "genome_indices/chrom_length" + label: "Chromosome lengths file" + format: "http://edamontology.org/format_2330" + doc: "Chromosome lengths file in TSV format" + + control_file: + type: File? 
+ default: null + 'sd:upstreamSource': "control_file/bambai_pair" + 'sd:localLabel': true + label: "Control ChIP-Seq single-read experiment" + format: "http://edamontology.org/format_2572" + doc: "Indexed BAM file from the ChIP-Seq single-read experiment to be used as a control for MACS2 peak calling" + + broad_peak: + type: boolean? + default: False + # 'sd:parent': "https://raw.githubusercontent.com/datirium/workflows/master/tags/antibody-dummy.cwl" + label: "Call broad peaks" + doc: "Make MACS2 call broad peaks by linking nearby highly enriched regions" + + fastq_file: + type: + - File + - type: array + items: File + label: "FASTQ file(s)" + format: "http://edamontology.org/format_1930" + doc: "Single-read sequencing data in FASTQ format (fastq, fq, bzip2, gzip, zip)" + + exp_fragment_size: + type: int? + default: 150 + 'sd:layout': + advanced: true + label: "Expected fragment size" + doc: "Expected fragment size for read extension towards 3' end if force_fragment_size was set to True or if the fragment size calculated by MACS2 was less than 80 bp" + + force_fragment_size: + type: boolean? + default: false + 'sd:layout': + advanced: true + label: "Force peak calling with expected fragment size" + doc: "Make MACS2 skip building the shifting model and use expected fragment size for read extension towards 3' end" + + clip_3p_end: + type: int? + default: 0 + 'sd:layout': + advanced: true + label: "Clip from 3' end" + doc: "Number of base pairs to clip from 3' end" + + clip_5p_end: + type: int? + default: 0 + 'sd:layout': + advanced: true + label: "Clip from 5' end" + doc: "Number of base pairs to clip from 5' end" + + remove_duplicates: + type: boolean? + default: false + 'sd:layout': + advanced: true + label: "Remove PCR duplicates" + doc: "Remove PCR duplicates from sorted BAM file" + + peak_calling_fdr: + type: float? 
+ default: 0.05 + 'sd:layout': + advanced: true + label: "Minimum FDR (q-value) cutoff for peak detection" + doc: | + Minimum FDR (q-value) cutoff for peak detection. -q, and + -p are mutually exclusive. + + promoter_dist: + type: int? + default: 1000 + 'sd:layout': + advanced: true + label: "Max distance from gene TSS (in both direction) overlapping which the peak will be assigned to the promoter region" + doc: "Max distance from gene TSS (in both direction) overlapping which the peak will be assigned to the promoter region" + + upstream_dist: + type: int? + default: 20000 + 'sd:layout': + advanced: true + label: "Max distance from the promoter (only in upstream direction) overlapping which the peak will be assigned to the upstream region" + doc: "Max distance from the promoter (only in upstream direction) overlapping which the peak will be assigned to the upstream region" + + threads: + type: int? + default: 2 + 'sd:layout': + advanced: true + label: "Number of threads" + doc: "Number of threads for those steps that support multithreading" + + +outputs: + + unaligned_fastq: + type: + - "null" + - File[] + format: "http://edamontology.org/format_1930" + label: "Unaligned FASTQ file(s)" + doc: "Unaligned FASTQ file(s)" + outputSource: bowtie_aligner/unaligned_fastq + + multimapped_fastq: + type: + - "null" + - File[] + format: "http://edamontology.org/format_1930" + label: "Multimapped FASTQ file(s)" + doc: "Multimapped FASTQ file(s)" + outputSource: bowtie_aligner/multimapped_fastq + + bigwig: + type: File + format: "http://edamontology.org/format_3006" + label: "Genome coverage" + doc: "Genome coverage in bigWig format" + outputSource: bam_to_bigwig/bigwig_file + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + optional: true + type: 'wig' + name: "Genome Coverage" + height: 120 + + fastx_statistics: + type: File + label: "FASTQ quality statistics" + format: "http://edamontology.org/format_2330" + doc: "FASTQ quality statistics 
in TSV format" + outputSource: fastx_quality_stats/statistics_file + 'sd:visualPlugins': + - line: + tab: 'QC Plots' + Title: 'Base Frequency Plot' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Frequency' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$13, $14, $15, $16, $17] + - boxplot: + tab: 'QC Plots' + Title: 'Base Quality Plot' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Quality score' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$11, $7, $8, $9, $12] + + bowtie_log: + type: File + label: "Read alignment log" + format: "http://edamontology.org/format_2330" + doc: "Read alignment log file from Bowtie" + outputSource: bowtie_aligner/log_file + + iaintersect_result: + type: File + label: "Gene annotated peaks" + format: "http://edamontology.org/format_3475" + doc: "MACS2 peak file annotated with nearby genes" + outputSource: island_intersect/result_file + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'Peak Calling' + Title: 'Peak Coordinates' + + atdp_result: + type: File + label: "Average Tag Density Plot" + format: "http://edamontology.org/format_3475" + doc: "Average Tag Density Plot file in TSV format" + outputSource: average_tag_density/result_file + 'sd:visualPlugins': + - scatter: + tab: 'QC Plots' + Title: 'Average Tag Density Plot' + xAxisTitle: 'Distance From TSS (bp)' + yAxisTitle: 'Average Tag Density (per bp)' + colors: ["#b3de69"] + height: 500 + data: [$1, $2] + comparable: "atdp" + + bambai_pair: + type: File + format: "http://edamontology.org/format_2572" + label: "Aligned reads" + doc: "Coordinate sorted BAM alignment and index BAI files" + outputSource: samtools_remove_duplicates/deduplicated_bam_bai_pair + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + type: 'alignment' + format: 'bam' + name: "Nucleotide Sequence Alignments" + displayMode: "SQUISHED" + + macs2_called_peaks: + type: File + label: "Called peaks" + format: 
"http://edamontology.org/format_3468" + doc: "Called peaks file with 1-based coordinates in XLS format" + outputSource: macs2_callpeak/peak_xls_file + + macs2_narrow_peaks: + type: File? + label: "Narrow peaks" + format: "http://edamontology.org/format_3613" + doc: "Called peaks file in ENCODE narrow peak format" + outputSource: macs2_callpeak/narrow_peak_file + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + type: 'annotation' + name: "Narrow peaks" + displayMode: "COLLAPSE" + height: 40 + + macs2_broad_peaks: + type: File? + label: "Broad peaks" + format: "http://edamontology.org/format_3614" + doc: "Called peaks file in ENCODE broad peak format" + outputSource: macs2_callpeak/broad_peak_file + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + type: 'annotation' + name: "Broad peaks" + displayMode: "COLLAPSE" + height: 40 + + workflow_statistics_yaml: + type: File? + label: "YAML formatted combined log" + format: "http://edamontology.org/format_3750" + doc: "YAML formatted combined log" + outputSource: get_stat/collected_statistics_yaml + + workflow_statistics_markdown: + type: File? 
+ label: "Markdown formatted combined log" + format: "http://edamontology.org/format_3835" + doc: "Markdown formatted combined log" + outputSource: get_stat/collected_statistics_md + 'sd:visualPlugins': + - markdownView: + tab: 'Overview' + + workflow_statistics_tsv: + type: File + label: "Workflow execution statistics" + format: "http://edamontology.org/format_3475" + doc: "Overall workflow execution statistics from bowtie_aligner and samtools_rmdup steps" + outputSource: get_stat/collected_statistics_tsv + 'sd:visualPlugins': + - tableView: + vertical: true + tab: 'Overview' + 'sd:preview': + 'sd:visualPlugins': + - pie: + colors: ['#b3de69', '#99c0db', '#fb8072', '#fdc381'] + data: [$2, $3, $4, $5] + + bam_statistics_report: + type: File + label: "BAM statistics report (original)" + format: "http://edamontology.org/format_2330" + doc: "BAM statistics report (right after alignment and sorting)" + outputSource: get_bam_statistics/log_file + + bam_statistics_report_after_filtering: + type: File + label: "BAM statistics report (after filtering)" + format: "http://edamontology.org/format_2330" + doc: "BAM statistics report (after all filters applied)" + outputSource: get_bam_statistics_after_filtering/log_file + + preseq_estimates_plot_data: + type: File? 
+ label: "Preseq estimates" + format: "http://edamontology.org/format_3475" + doc: "Preseq estimated results" + outputSource: preseq_plot_data/estimates_file_plot_data + 'sd:visualPlugins': + - line: + tab: 'QC Plots' + Title: 'Distinct Read Counts Estimates' + xAxisTitle: 'Mapped Reads/Fragments/Tags (millions)' + yAxisTitle: 'Distinct Reads Count' + colors: ["#4b78a3", "#a3514b"] + height: 500 + data: [$2, $5] + + estimated_fragment_size: + type: int + label: "Estimated fragment size" + doc: "Estimated fragment size for downstream analyses" + outputSource: macs2_callpeak/macs2_fragments_calculated + + mapped_reads_number: + type: int + label: "Mapped reads number" + doc: "Mapped reads number for downstream analyses" + outputSource: get_stat/mapped_reads + + +steps: + + extract_fastq: + label: "Loading unmapped sequence data" + doc: | + Most DNA cores and commercial NGS companies return unmapped sequence data in FASTQ format. + The data can be uploaded from the user's computer, downloaded directly from an FTP server of + the core facility by providing a URL or from GEO by providing an SRA accession number. + run: ../tools/extract-fastq.cwl + in: + compressed_file: fastq_file + output_prefix: + default: "read_1" + out: [fastq_file] + + fastx_quality_stats: + label: "Quality control of unmapped sequence data" + doc: | + Evaluates the quality of your sequence data. Provides per base quality scores as well as + base frequencies along the reads. These metrics can be used to identify whether your data + has any problems that should be taken into account in the subsequent analysis steps. + run: ../tools/fastx-quality-stats.cwl + in: + input_file: extract_fastq/fastq_file + out: [statistics_file] + + bowtie_aligner: + label: "Alignment to reference genome" + doc: | + Aligns reads to the reference genome. + Reads are assumed to be mapped if they + have no more than 3 mismatches. + sam_file output includes both mapped + and unmapped reads. 
+ run: ../tools/bowtie-alignreads.cwl + in: + upstream_filelist: extract_fastq/fastq_file + indices_folder: indices_folder + clip_3p_end: clip_3p_end + clip_5p_end: clip_5p_end + v: + default: 3 + m: + default: 1 + best: + default: true + strata: + default: true + sam: + default: true + unaligned_prefix: + default: "unaligned_reads" + multimapped_prefix: + default: "multimapped_reads" + threads: threads + q: + default: true + X: + default: 500 + out: + - sam_file + - log_file + - unaligned_fastq + - multimapped_fastq + + samtools_sort_index: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: bowtie_aligner/sam_file + threads: threads + out: [bam_bai_pair] + + samtools_mark_duplicates: + run: ../tools/samtools-markdup.cwl + in: + bam_bai_pair: samtools_sort_index/bam_bai_pair + keep_duplicates: + default: true + threads: threads + out: [deduplicated_bam_bai_pair] + + clean_sam_headers_for_preseq: + run: ../tools/samtools-clean-headers.cwl + in: + bam_file: samtools_mark_duplicates/deduplicated_bam_bai_pair + out: [preseq_bam] + + preseq: + label: "Sequencing depth estimation" + doc: | + Estimates the complexity of the sequencing library, evaluates how many reads can + be expected from the additional sequencing of the same experiment. + run: ../tools/preseq-lc-extrap.cwl + in: + bam_file: clean_sam_headers_for_preseq/preseq_bam + extrapolation: + default: 1000000000 + out: [estimates_file, log_file_stdout, log_file_stderr] + + samtools_remove_duplicates: + run: ../tools/samtools-markdup.cwl + in: + bam_bai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair + keep_duplicates: + source: remove_duplicates + valueFrom: $(!self) + threads: threads + out: [deduplicated_bam_bai_pair] + + macs2_callpeak: + label: "Peak detection" + doc: | + Identifies genome areas enriched with aligned reads. Those areas correspond to the + transcription factor binding sites. 
+ run: ../tools/macs2-callpeak-biowardrobe-only.cwl + in: + treatment_file: samtools_remove_duplicates/deduplicated_bam_bai_pair + control_file: control_file + nolambda: + source: control_file + valueFrom: $(!self) + genome_size: genome_size + mfold: + default: "4 40" + verbose: + default: 3 + nomodel: force_fragment_size + extsize: exp_fragment_size + bw: exp_fragment_size + broad: broad_peak + call_summits: + source: broad_peak + valueFrom: $(!self) + keep_dup: + default: auto + q_value: peak_calling_fdr + format_mode: + default: BAM + buffer_size: + default: 10000 + out: + - peak_xls_file + - narrow_peak_file + - broad_peak_file + - macs2_fragments_calculated + + bam_to_bigwig: + run: ../tools/bam-bedgraph-bigwig.cwl + in: + bam_file: samtools_remove_duplicates/deduplicated_bam_bai_pair + chrom_length_file: chrom_length + mapped_reads_number: get_stat/mapped_reads + fragment_size: macs2_callpeak/macs2_fragments_calculated + out: [bigwig_file] + + get_bam_statistics: + label: "Quality control of aligned sequence data" + doc: | + Calculates alignment statistics, such as reads mapped/unmapped, average + read length and quality score, etc. 
+ run: ../tools/samtools-stats.cwl + in: + bambai_pair: samtools_mark_duplicates/deduplicated_bam_bai_pair + output_filename: + source: samtools_mark_duplicates/deduplicated_bam_bai_pair + valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") + out: [log_file] + + get_bam_statistics_after_filtering: + run: ../tools/samtools-stats.cwl + in: + bambai_pair: samtools_remove_duplicates/deduplicated_bam_bai_pair + output_filename: + source: samtools_remove_duplicates/deduplicated_bam_bai_pair + valueFrom: $(get_root(self.basename)+"_bam_statistics_report_after_filtering.txt") + out: [log_file, reads_mapped] + + get_stat: + run: ../tools/collect-statistics-chip-seq.cwl + in: + bowtie_alignment_report: bowtie_aligner/log_file + bam_statistics_report: get_bam_statistics/log_file + bam_statistics_after_filtering_report: get_bam_statistics_after_filtering/log_file + macs2_called_peaks: macs2_callpeak/peak_xls_file + atdp_results: average_tag_density/result_file + preseq_results: preseq/estimates_file + out: [collected_statistics_yaml, collected_statistics_tsv, mapped_reads, collected_statistics_md] + + preseq_plot_data: + label: "Formats sequencing depth estimation data for plotting" + doc: | + Formats estimates file from preseq standard output for QC plotting. This adds a new + column that includes the actual read count point on the plot. + run: ../tools/preseq-plot-data.cwl + in: + preseq_stderr_log_file: preseq/log_file_stderr + estimates_file: preseq/estimates_file + mapped_reads: get_stat/mapped_reads + out: [estimates_file_plot_data] + + island_intersect: + label: "Peak annotation" + doc: | + Assigns nearest genes to peaks to explore the biological implication of the open + chromatin binding sites. 
+ run: ../tools/iaintersect.cwl + in: + input_filename: macs2_callpeak/peak_xls_file + annotation_filename: annotation_file + promoter_bp: promoter_dist + upstream_bp: upstream_dist + out: [result_file] + + samtools_sort_index_for_atdp: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: samtools_remove_duplicates/deduplicated_bam_bai_pair + threads: threads + out: [bam_bai_pair] + + average_tag_density: + label: "Read enrichment around genes TSS" + doc: | + Generates average tag density plot around genes TSS as a lot of cis-regulatory + elements are close to the TSS of their targets. + run: ../tools/atdp.cwl + in: + input_file: samtools_sort_index_for_atdp/bam_bai_pair + annotation_filename: annotation_file + fragmentsize_bp: macs2_callpeak/macs2_fragments_calculated + avd_window_bp: + default: 5000 + avd_smooth_bp: + default: 50 + ignore_chr: + default: chrM + double_chr: + default: "chrX chrY" + avd_heat_window_bp: + default: 200 + mapped_reads: + source: get_bam_statistics_after_filtering/reads_mapped + valueFrom: $(parseInt(self)) + out: [result_file] + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Deprecated. ChIP-Seq pipeline single-read" +s:name: "Deprecated. 
ChIP-Seq pipeline single-read" +s:alternateName: "ChIP-Seq basic analysis workflow for single-read data" + +s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/chipseq-se.cwl +s:codeRepository: https://github.com/datirium/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:michael.kotliar@cchmc.org + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + +# doc: +# $include: ../descriptions/chipseq-se.md + + +doc: | + # ChIP-Seq basic analysis workflow for single-read data + + Reads are aligned to the reference genome with [Bowtie](http://bowtie-bio.sourceforge.net/index.shtml). Results are saved as coordinate sorted [BAM](http://samtools.github.io/hts-specs/SAMv1.pdf) alignment and index BAI files. Optionally, PCR duplicates can be removed. To obtain coverage in [bigWig](https://genome.ucsc.edu/goldenpath/help/bigWig.html) format, average fragment length is calculated by [MACS2](https://github.com/taoliu/MACS), and individual reads are extended to this length in the 3’ direction. 
Areas of enrichment identified by MACS2 are saved in ENCODE [narrow peak](http://genome.ucsc.edu/FAQ/FAQformat.html#format12) or [broad peak](https://genome.ucsc.edu/FAQ/FAQformat.html#format13) formats. Called peaks together with the nearest genes are saved in TSV format. In addition to basic statistics (number of total/mapped/multi-mapped/unmapped/duplicate reads), pipeline generates several quality control measures. Base frequency plots are used to estimate adapter contamination, a frequent occurrence in low-input ChIP-Seq experiments. Expected distinct reads count from [Preseq](http://smithlabresearch.org/software/preseq/) can be used to estimate read redundancy for a given sequencing depth. Average tag density profiles can be used to estimate ChIP enrichment for promoter proximal histone modifications. Use of different parameters for different antibodies (calling broad or narrow peaks) is possible. Additionally, users can elect to use BAM file from another experiment as control for MACS2 peak calling. + + ## Cite as + + *Kartashov AV, Barski A. BioWardrobe: an integrated platform for analysis of epigenomics and transcriptomics data. Genome Biol. 2015;16(1):158. Published 2015 Aug 7. 
[doi:10.1186/s13059-015-0720-3](https://www.ncbi.nlm.nih.gov/pubmed/26248465)* + + ## Software versions + + - Bowtie 1.2.0 + - Samtools 1.4 + - Preseq 2.0 + - MACS2 2.1.1.20160309 + - Bedtools 2.26.0 + - UCSC userApps v358 + + ## Inputs + + | ID | Label | Description | Required | Default | Upstream analyses | + | ------------------------- | ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------: | ------- | ------------------------------- | + | **fastq\_file** | FASTQ file | Single-read sequencing data in FASTQ format (fastq, fq, bzip2, gzip, zip) | + | | | + | **indices\_folder** | Genome indices | Directory with the genome indices generated by Bowtie | + | | genome\_indices/bowtie\_indices | + | **annotation\_file** | Genome annotation file | Genome annotation file in TSV format | + | | genome\_indices/annotation | + | **genome\_size** | Effective genome size | The length of the mappable genome (hs, mm, ce, dm or number, for example 2.7e9) | + | | genome\_indices/genome\_size | + | **chrom\_length** | Chromosome lengths file | Chromosome lengths file in TSV format | + | | genome\_indices/chrom\_length | + | **broad\_peak** | Call broad peaks | Make MACS2 call broad peaks by linking nearby highly enriched regions | + | | | + | **control\_file** | Control ChIP-Seq single-read experiment | Indexed BAM file from the ChIP-Seq single-read experiment to be used as a control for MACS2 peak calling | | Null | control\_file/bambai\_pair | + | **exp\_fragment\_size** | Expected fragment size | Expected fragment size for read extension towards 3' end if *force\_fragment\_size* was set to True or if the fragment size calculated by MACS2 was less than 80 bp | | 150 | | + | **force\_fragment\_size** | Force peak calling with expected fragment size | Make MACS2 skip building the shifting model and use the expected fragment 
size for read extension towards 3' end | | False | | + | **clip\_3p\_end** | Clip from 3' end | Number of base pairs to clip from 3' end | | 0 | | + | **clip\_5p\_end** | Clip from 5' end | Number of base pairs to clip from 5' end | | 0 | | + | **remove\_duplicates** | Remove PCR duplicates | Remove PCR duplicates from sorted BAM file | | False | | + | **threads** | Number of threads | Number of threads for those steps that support multithreading | | 2 | | + + + ## Outputs + + | ID | Label | Description | Required | Visualization | + | ------------------------ | ---------------------------------- | ------------------------------------------------------------------------------------ | :------: | ------------------------------------------------------------------ | + | **fastx\_statistics** | FASTQ quality statistics | FASTQ quality statistics in TSV format | + | *Base Frequency Plot* and *Base Quality Plot* in *QC Plots* tab | + | **bambai\_pair** | Aligned reads | Coordinate sorted BAM alignment and index BAI files | + | *Nucleotide Sequence Alignments* track in *IGV Genome Browser* tab | + | **bigwig** | Genome coverage | Genome coverage in bigWig format | + | *Genome Coverage* track in *IGV Genome Browser* tab | + | **iaintersect\_result** | Gene annotated peaks | MACS2 peak file annotated with nearby genes | + | *Peak Coordinates* table in *Peak Calling* tab | + | **atdp\_result** | Average Tag Density Plot | Average Tag Density Plot file in TSV format | + | *Average Tag Density Plot* in *QC Plots* tab | + | **macs2\_called\_peaks** | Called peaks | Called peaks file with 1-based coordinates in XLS format | + | | + | **macs2\_narrow\_peaks** | Narrow peaks | Called peaks file in ENCODE narrow peak format | | *Narrow peaks* track in *IGV Genome Browser* tab | + | **macs2\_broad\_peaks** | Broad peaks | Called peaks file in ENCODE broad peak format | | *Broad peaks* track in *IGV Genome Browser* tab | + | **preseq\_estimates** | Expected Distinct Reads Count Plot | 
Expected distinct reads count file from Preseq in TSV format | | *Expected Distinct Reads Count Plot* in *QC Plots* tab | + | **workflow\_statistics** | Workflow execution statistics | Overall workflow execution statistics from bowtie\_aligner and samtools\_rmdup steps | + | *Overview* tab and experiment's preview | + | **bowtie\_log** | Read alignment log | Read alignment log file from Bowtie | + | | \ No newline at end of file diff --git a/workflows/rnaseq-pe-dutp-mitochondrial.cwl b/workflows/rnaseq-pe-dutp-mitochondrial.cwl new file mode 100644 index 00000000..8c5496fc --- /dev/null +++ b/workflows/rnaseq-pe-dutp-mitochondrial.cwl @@ -0,0 +1,634 @@ +cwlVersion: v1.0 +class: Workflow + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var get_root = function(basename) { + return basename.split('.').slice(0,1).join('.'); + }; + +'sd:metadata': +- "../metadata/rnaseq-header.cwl" + + +'sd:upstream': + genome_indices: "genome-indices.cwl" + +inputs: + +# General inputs + + star_indices_folder: + type: Directory + label: "STAR indices folder" + 'sd:upstreamSource': "genome_indices/star_indices" + doc: "Path to STAR generated indices" + + star_indices_folder_mitochondrial: + type: Directory + label: "STAR indices mitochondrial folder" + 'sd:upstreamSource': "genome_indices/mitochondrial_indices" + doc: "Path to STAR generated indices for mitochondrial dna" + + bowtie_indices_folder: + type: Directory + label: "BowTie Ribosomal Indices" + 'sd:upstreamSource': "genome_indices/ribosomal_indices" + doc: "Path to Bowtie generated indices" + + chrom_length_file: + type: File + label: "Chromosome length file" + format: "http://edamontology.org/format_2330" + 'sd:upstreamSource': "genome_indices/chrom_length" + doc: "Chromosome length file" + + annotation_file: + type: File + label: "Annotation file" + format: + - 
"http://edamontology.org/format_2306" + - "http://edamontology.org/format_3475" + 'sd:upstreamSource': "genome_indices/annotation" + doc: "GTF or TAB-separated annotation file" + + fastq_file_upstream: + type: File + label: "FASTQ 1 input file" + format: "http://edamontology.org/format_1930" + doc: "Reads data in a FASTQ format, received after paired end sequencing" + + fastq_file_downstream: + type: File + label: "FASTQ 2 input file" + format: "http://edamontology.org/format_1930" + doc: "Reads data in a FASTQ format, received after paired end sequencing" + +# Advanced inputs + + exclude_chr: + type: string? + 'sd:layout': + advanced: true + label: "Chromosome to be excluded in rpkm calculation" + doc: "Chromosome to be excluded in rpkm calculation" + + clip_3p_end: + type: int? + default: 0 + 'sd:layout': + advanced: true + label: "Clip from 3p end" + doc: "Number of bases to clip from the 3p end" + + clip_5p_end: + type: int? + default: 0 + 'sd:layout': + advanced: true + label: "Clip from 5p end" + doc: "Number of bases to clip from the 5p end" + +# System dependent + + threads: + type: int? 
+ default: 2 + 'sd:layout': + advanced: true + label: "Number of threads" + doc: "Number of threads for those steps that support multithreading" + +outputs: + + bigwig_upstream: + type: File + format: "http://edamontology.org/format_3006" + label: "BigWig file" + doc: "Generated BigWig file for (+)strand reads" + outputSource: bam_to_bigwig_upstream/bigwig_file + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "(+)strand BigWig" + height: 120 + + bigwig_downstream: + type: File + format: "http://edamontology.org/format_3006" + label: "BigWig file" + doc: "Generated BigWig file for (-)strand reads" + outputSource: bam_to_bigwig_downstream/bigwig_file + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "(-)strand BigWig" + height: 120 + + star_final_log: + type: File + format: "http://edamontology.org/format_2330" + label: "STAR final log" + doc: "STAR Log.final.out" + outputSource: star_aligner/log_final + + star_out_log: + type: File? + format: "http://edamontology.org/format_2330" + label: "STAR log out" + doc: "STAR Log.out" + outputSource: star_aligner/log_out + + star_progress_log: + type: File? + format: "http://edamontology.org/format_2330" + label: "STAR progress log" + doc: "STAR Log.progress.out" + outputSource: star_aligner/log_progress + + star_stdout_log: + type: File? + format: "http://edamontology.org/format_2330" + label: "STAR stdout log" + doc: "STAR Log.std.out" + outputSource: star_aligner/log_std + + star_sj_log: + type: File? 
+ format: "http://edamontology.org/format_2330" + label: "STAR sj log" + doc: "STAR SJ.out.tab" + outputSource: star_aligner/log_sj + + fastx_statistics_upstream: + type: File + format: "http://edamontology.org/format_2330" + label: "FASTQ 1 statistics" + doc: "fastx_quality_stats generated FASTQ 1 quality statistics file" + outputSource: fastx_quality_stats_upstream/statistics_file + 'sd:visualPlugins': + - line: + tab: 'QC Plots' + Title: 'FASTQ 1 Base frequency plot' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Frequency' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$13, $14, $15, $16, $17] + - boxplot: + tab: 'QC Plots' + Title: 'FASTQ 1 Quality Control' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Quality score' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$11, $7, $8, $9, $12] + + fastx_statistics_downstream: + type: File + format: "http://edamontology.org/format_2330" + label: "FASTQ 2 statistics" + doc: "fastx_quality_stats generated FASTQ 2 quality statistics file" + outputSource: fastx_quality_stats_downstream/statistics_file + 'sd:visualPlugins': + - line: + tab: 'QC Plots' + Title: 'FASTQ 2 Base frequency plot' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Frequency' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$13, $14, $15, $16, $17] + - boxplot: + tab: 'QC Plots' + Title: 'FASTQ 2 Quality Control' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Quality score' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$11, $7, $8, $9, $12] + + bam_merged_index: + type: File + format: "http://edamontology.org/format_2572" + label: "Coordinate sorted BAM alignment file (+index BAI)" + doc: "Coordinate sorted BAM file and BAI index file" + outputSource: merge_original_and_mitochondrial_index/bam_bai_pair + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + optional: true + type: 'alignment' + 
format: 'bam' + name: "BAM Track" + displayMode: "SQUISHED" + + bowtie_log: + type: File + format: "http://edamontology.org/format_2330" + label: "Bowtie alignment log" + doc: "Bowtie alignment log file" + outputSource: bowtie_aligner/log_file + + rpkm_isoforms: + type: File + format: "http://edamontology.org/format_3752" + label: "RPKM, grouped by isoforms" + doc: "Calculated rpkm values, grouped by isoforms" + outputSource: rpkm_calculation/isoforms_file + + rpkm_genes: + type: File + format: "http://edamontology.org/format_3475" + label: "RPKM, grouped by gene name" + doc: "Calculated rpkm values, grouped by gene name" + outputSource: group_isoforms/genes_file + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'Gene Expression' + Title: 'RPKM, grouped by gene name' + + rpkm_common_tss: + type: File + format: "http://edamontology.org/format_3475" + label: "RPKM, grouped by common TSS" + doc: "Calculated rpkm values, grouped by common TSS" + outputSource: group_isoforms/common_tss_file + + htseq_count_gene_expression_file: + type: File + format: "http://edamontology.org/format_3475" + label: "HTSeq: read counts grouped by gene_id" + doc: "HTSeq: read counts grouped by gene_id" + outputSource: htseq_count_gene_expression/feature_counts_report_file + + htseq_count_stdout_log: + type: File + format: "http://edamontology.org/format_2330" + label: "HTSeq: stdout log" + doc: "HTSeq: stdout log" + outputSource: htseq_count_gene_expression/stdout_log + + htseq_count_stderr_log: + type: File + format: "http://edamontology.org/format_2330" + label: "HTSeq: stderr log" + doc: "HTSeq: stderr log" + outputSource: htseq_count_gene_expression/stderr_log + + get_stat_log: + type: File? + label: "YAML formatted combined log" + format: "http://edamontology.org/format_3750" + doc: "YAML formatted combined log" + outputSource: get_stat/collected_statistics_yaml + + get_stat_markdown: + type: File? 
+ label: "Markdown formatted combined log" + format: "http://edamontology.org/format_3835" + doc: "Markdown formatted combined log" + outputSource: get_stat/collected_statistics_md + 'sd:visualPlugins': + - markdownView: + tab: 'Overview' + + get_formatted_stats: + type: File? + label: "Bowtie, STAR and GEEP mapping stats" + format: "http://edamontology.org/format_2330" + doc: "Processed and combined Bowtie & STAR aligner and GEEP logs" + outputSource: get_stat/collected_statistics_tsv + 'sd:visualPlugins': + - tableView: + vertical: true + tab: 'Overview' + 'sd:preview': + 'sd:visualPlugins': + - pie: + colors: ['#b3de69', '#99c0db', '#fdc381', '#fb8072'] + data: [$2, $3, $4, $5] + + bam_statistics_report: + type: File + label: "BAM statistics report" + format: "http://edamontology.org/format_2330" + doc: "BAM statistics report (right after alignment and sorting)" + outputSource: get_bam_statistics/log_file + + insert_size_report: + type: File + label: "Insert size distribution report" + format: "http://edamontology.org/format_3475" + doc: "Insert size distribution report (right after alignment and sorting)" + outputSource: get_bam_statistics/ext_is_section + 'sd:visualPlugins': + - scatter: + tab: 'QC Plots' + Title: 'Insert Size Distribution' + xAxisTitle: 'Insert size' + yAxisTitle: 'Pairs total' + colors: ["#4b78a3"] + height: 500 + data: [$1, $2] + comparable: "isdp" + + +steps: + + extract_fastq_upstream: + run: ../tools/extract-fastq.cwl + in: + output_prefix: + default: "read_1" + compressed_file: fastq_file_upstream + out: [fastq_file] + + extract_fastq_downstream: + run: ../tools/extract-fastq.cwl + in: + output_prefix: + default: "read_2" + compressed_file: fastq_file_downstream + out: [fastq_file] + + fastx_quality_stats_upstream: + run: ../tools/fastx-quality-stats.cwl + in: + input_file: extract_fastq_upstream/fastq_file + out: [statistics_file] + + fastx_quality_stats_downstream: + run: ../tools/fastx-quality-stats.cwl + in: + input_file: 
extract_fastq_downstream/fastq_file + out: [statistics_file] + + star_aligner: + run: ../tools/star-alignreads.cwl + in: + readFilesIn: [extract_fastq_upstream/fastq_file, extract_fastq_downstream/fastq_file] + genomeDir: star_indices_folder + outFilterMultimapNmax: + default: 1 + outFilterMismatchNmax: + default: 5 + alignSJDBoverhangMin: + default: 1 + seedSearchStartLmax: + default: 15 + outReadsUnmapped: + default: "Fastx" + clip3pNbases: clip_3p_end + clip5pNbases: clip_5p_end + threads: threads + out: + - aligned_file + - log_final + - unmapped_mate_1_file + - uniquely_mapped_reads_number + - log_out + - log_progress + - log_std + - log_sj + + star_aligner_mitochondrial: + run: ../tools/star-alignreads.cwl + in: + readFilesIn: star_aligner/unmapped_mate_1_file + genomeDir: star_indices_folder_mitochondrial + outFilterMultimapNmax: + default: 1 + outFilterMismatchNmax: + default: 5 + alignSJDBoverhangMin: + default: 1 + seedSearchStartLmax: + default: 15 + clip3pNbases: clip_3p_end + clip5pNbases: clip_5p_end + threads: threads + out: + - aligned_file + - log_final + - uniquely_mapped_reads_number + - log_out + - log_progress + - log_std + - log_sj + + samtools_sort_index_mitochondrial: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: star_aligner_mitochondrial/aligned_file + sort_output_filename: + source: extract_fastq_upstream/fastq_file + valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'_mitochondrial.bam') + threads: threads + out: [bam_bai_pair] + + samtools_sort_index: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: star_aligner/aligned_file + sort_output_filename: + source: extract_fastq_upstream/fastq_file + valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'_sorted.bam') + threads: threads + out: [bam_bai_pair] + + merge_original_and_mitochondrial: + run: ../tools/samtools-merge.cwl + in: + output_filename: + source: extract_fastq_upstream/fastq_file + 
valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'_merged.bam') + alignment_files: [ samtools_sort_index/bam_bai_pair, samtools_sort_index_mitochondrial/bam_bai_pair ] + out: [merged_alignment_file] + + merge_original_and_mitochondrial_index: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: merge_original_and_mitochondrial/merged_alignment_file + sort_output_filename: + source: extract_fastq_upstream/fastq_file + valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'.bam') + threads: threads + out: [bam_bai_pair] + + bam_to_bigwig_upstream: + run: ../tools/bam-bedgraph-bigwig.cwl + in: + bam_file: merge_original_and_mitochondrial_index/bam_bai_pair + chrom_length_file: chrom_length_file + mapped_reads_number: star_aligner/uniquely_mapped_reads_number + bigwig_filename: + source: extract_fastq_upstream/fastq_file + valueFrom: | + ${ + var root = self.basename.split('.').slice(0,-1).join('.'); + var ext = "_upstream.bigWig"; + return (root == "")?self.basename+ext:root+ext; + } + strand: + default: '+' + out: [bigwig_file] + + bam_to_bigwig_downstream: + run: ../tools/bam-bedgraph-bigwig.cwl + in: + bam_file: merge_original_and_mitochondrial_index/bam_bai_pair + chrom_length_file: chrom_length_file + mapped_reads_number: + source: star_aligner/uniquely_mapped_reads_number + valueFrom: $(-self) + bigwig_filename: + source: extract_fastq_upstream/fastq_file + valueFrom: | + ${ + var root = self.basename.split('.').slice(0,-1).join('.'); + var ext = "_downstream.bigWig"; + return (root == "")?self.basename+ext:root+ext; + } + strand: + default: '-' + out: [bigwig_file] + + bowtie_aligner: + run: ../tools/bowtie-alignreads.cwl + in: + upstream_filelist: extract_fastq_upstream/fastq_file + downstream_filelist: extract_fastq_downstream/fastq_file + indices_folder: bowtie_indices_folder + clip_3p_end: clip_3p_end + clip_5p_end: clip_5p_end + v: + default: 3 + m: + default: 1 + sam: + default: true 
+ threads: threads + out: [log_file] + + rpkm_calculation: + run: ../tools/geep.cwl + in: + bam_file: merge_original_and_mitochondrial_index/bam_bai_pair + annotation_file: annotation_file + dutp: + default: true + rpkm_threshold: + default: 0.001 + exclude_chr: exclude_chr + threads: threads + out: [isoforms_file] + + group_isoforms: + run: ../tools/group-isoforms.cwl + in: + isoforms_file: rpkm_calculation/isoforms_file + out: + - genes_file + - common_tss_file + + get_annotation_gtf: + run: ../tools/ucsc-genepredtogtf.cwl + in: + annotation_tsv_file: annotation_file + out: + - annotation_gtf_file + + htseq_count_gene_expression: + run: ../tools/htseq-count.cwl + in: + alignment_bam_file: merge_original_and_mitochondrial_index/bam_bai_pair + annotation_gtf_file: get_annotation_gtf/annotation_gtf_file + strand_specific: + default: "reverse" + feature_type: + default: "exon" + feature_id: + default: "gene_id" + out: + - feature_counts_report_file + - stdout_log + - stderr_log + + get_bam_statistics: + run: ../tools/samtools-stats.cwl + in: + bambai_pair: samtools_sort_index/bam_bai_pair + output_filename: + source: samtools_sort_index/bam_bai_pair + valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") + out: [log_file, ext_is_section] + + get_stat: + run: ../tools/collect-statistics-rna-seq.cwl + in: + star_alignment_report: star_aligner/log_final + bowtie_alignment_report: bowtie_aligner/log_file + bam_statistics_report: get_bam_statistics/log_file + isoforms_file: rpkm_calculation/isoforms_file + paired_end: + default: true + out: [collected_statistics_yaml, collected_statistics_tsv, collected_statistics_md] + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +s:name: "RNA-Seq pipeline paired-end stranded mitochondrial" +label: "RNA-Seq pipeline paired-end stranded mitochondrial" +s:alternateName: "RNA-Seq strand specific mitochondrial workflow for 
pair-end experiment based on BioWardrobe's basic analysis" + +s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/rnaseq-pe-dutp-mitochondrial.cwl +s:codeRepository: https://github.com/datirium/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Datirium, LLC" + s:member: + - class: s:Person + s:name: Artem Barski + s:email: mailto:Artem.Barski@datirium.com + - class: s:Person + s:name: Andrey Kartashov + s:email: mailto:Andrey.Kartashov@datirium.com + s:sameAs: + - id: http://orcid.org/0000-0001-9102-5681 + + +# doc: +# $include: ../descriptions/rnaseq-pe-dutp-mitochondrial.md + + +doc: | + Slightly changed original [BioWardrobe's](https://biowardrobe.com) [PubMed ID:26248465](https://www.ncbi.nlm.nih.gov/pubmed/26248465) + **RNA-Seq** basic analysis for **strand specific paired-end** experiment. + Additional steps were added to map data to mitochondrial chromosome only and then merge the output. + + Experiment files in [FASTQ](http://maq.sourceforge.net/fastq.shtml) format either compressed or not can be used. + + Current workflow should be used only with the paired-end strand specific RNA-Seq data. It performs the following steps: + 1. `STAR` to align reads from input FASTQ file according to the predefined reference indices; generate unsorted BAM file and alignment statistics file + 2. `fastx_quality_stats` to analyze input FASTQ file and generate quality statistics file + 3. `samtools sort` to generate coordinate sorted BAM(+BAI) file pair from the unsorted BAM file obtained on the step 1 (after running STAR) + 4. Generate BigWig file based on the sorted BAM file + 5. Map input FASTQ file to predefined rRNA reference indices using Bowtie to define the level of rRNA contamination; export resulting statistics to file + 6.
Calculate isoform expression level for the sorted BAM file and GTF/TAB annotation file using `GEEP` reads-counting utility; export results to file \ No newline at end of file diff --git a/workflows/rnaseq-pe-dutp.cwl b/workflows/rnaseq-pe-dutp.cwl new file mode 100644 index 00000000..ed05ada3 --- /dev/null +++ b/workflows/rnaseq-pe-dutp.cwl @@ -0,0 +1,591 @@ +cwlVersion: v1.0 +class: Workflow + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var get_root = function(basename) { + return basename.split('.').slice(0,1).join('.'); + }; + +'sd:metadata': + - "../metadata/rnaseq-header.cwl" + +'sd:upstream': + genome_indices: "genome-indices.cwl" + +inputs: + +# General inputs + + star_indices_folder: + type: Directory + label: "STAR indices folder" + 'sd:upstreamSource': "genome_indices/star_indices" + doc: "Path to STAR generated indices" + + bowtie_indices_folder: + type: Directory + label: "BowTie Ribosomal Indices" + 'sd:upstreamSource': "genome_indices/ribosomal_indices" + doc: "Path to Bowtie generated indices" + + chrom_length_file: + type: File + label: "Chromosome length file" + format: "http://edamontology.org/format_2330" + 'sd:upstreamSource': "genome_indices/chrom_length" + doc: "Chromosome length file" + + annotation_file: + type: File + label: "Annotation file" + format: + - "http://edamontology.org/format_2306" + - "http://edamontology.org/format_3475" + 'sd:upstreamSource': "genome_indices/annotation" + doc: "GTF or TAB-separated annotation file" + + fastq_file_upstream: + type: File + label: "FASTQ 1 input file" + format: "http://edamontology.org/format_1930" + doc: "Reads data in a FASTQ format, received after paired end sequencing" + + fastq_file_downstream: + type: File + label: "FASTQ 2 input file" + format: "http://edamontology.org/format_1930" + doc: "Reads data in a FASTQ format, received after 
paired end sequencing" + +# Advanced inputs + + exclude_chr: + type: string? + 'sd:layout': + advanced: true + label: "Chromosome to be excluded in rpkm calculation" + doc: "Chromosome to be excluded in rpkm calculation" + + clip_3p_end: + type: int? + default: 0 + 'sd:layout': + advanced: true + label: "Clip from 3p end" + doc: "Number of bases to clip from the 3p end" + + clip_5p_end: + type: int? + default: 0 + 'sd:layout': + advanced: true + label: "Clip from 5p end" + doc: "Number of bases to clip from the 5p end" + +# System dependent + + threads: + type: int? + default: 2 + 'sd:layout': + advanced: true + label: "Number of threads" + doc: "Number of threads for those steps that support multithreading" + +outputs: + + bigwig_upstream: + type: File + format: "http://edamontology.org/format_3006" + label: "BigWig file" + doc: "Generated BigWig file for (+)strand reads" + outputSource: bam_to_bigwig_upstream/bigwig_file + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "(+)strand BigWig" + height: 120 + + bigwig_downstream: + type: File + format: "http://edamontology.org/format_3006" + label: "BigWig file" + doc: "Generated BigWig file for (-)strand reads" + outputSource: bam_to_bigwig_downstream/bigwig_file + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "(-)strand BigWig" + height: 120 + + star_final_log: + type: File + format: "http://edamontology.org/format_2330" + label: "STAR final log" + doc: "STAR Log.final.out" + outputSource: star_aligner/log_final + + star_out_log: + type: File? + format: "http://edamontology.org/format_2330" + label: "STAR log out" + doc: "STAR Log.out" + outputSource: star_aligner/log_out + + star_progress_log: + type: File? + format: "http://edamontology.org/format_2330" + label: "STAR progress log" + doc: "STAR Log.progress.out" + outputSource: star_aligner/log_progress + + star_stdout_log: + type: File? 
+ format: "http://edamontology.org/format_2330" + label: "STAR stdout log" + doc: "STAR Log.std.out" + outputSource: star_aligner/log_std + + star_sj_log: + type: File? + format: "http://edamontology.org/format_2330" + label: "STAR sj log" + doc: "STAR SJ.out.tab" + outputSource: star_aligner/log_sj + + fastx_statistics_upstream: + type: File + format: "http://edamontology.org/format_2330" + label: "FASTQ 1 statistics" + doc: "fastx_quality_stats generated FASTQ 1 quality statistics file" + outputSource: fastx_quality_stats_upstream/statistics_file + 'sd:visualPlugins': + - line: + tab: 'QC Plots' + Title: 'FASTQ 1 Base frequency plot' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Frequency' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$13, $14, $15, $16, $17] + - boxplot: + tab: 'QC Plots' + Title: 'FASTQ 1 Quality Control' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Quality score' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$11, $7, $8, $9, $12] + + fastx_statistics_downstream: + type: File + format: "http://edamontology.org/format_2330" + label: "FASTQ 2 statistics" + doc: "fastx_quality_stats generated FASTQ 2 quality statistics file" + outputSource: fastx_quality_stats_downstream/statistics_file + 'sd:visualPlugins': + - line: + tab: 'QC Plots' + Title: 'FASTQ 2 Base frequency plot' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Frequency' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$13, $14, $15, $16, $17] + - boxplot: + tab: 'QC Plots' + Title: 'FASTQ 2 Quality Control' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Quality score' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$11, $7, $8, $9, $12] + + bambai_pair: + type: File + format: "http://edamontology.org/format_2572" + label: "Coordinate sorted BAM alignment file (+index BAI)" + doc: "Coordinate sorted BAM file and BAI index file" + outputSource: 
samtools_sort_index/bam_bai_pair + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + optional: true + type: 'alignment' + format: 'bam' + name: "BAM Track" + displayMode: "SQUISHED" + + bowtie_log: + type: File + format: "http://edamontology.org/format_2330" + label: "Bowtie alignment log" + doc: "Bowtie alignment log file" + outputSource: bowtie_aligner/log_file + + rpkm_isoforms: + type: File + format: "http://edamontology.org/format_3752" + label: "RPKM, grouped by isoforms" + doc: "Calculated rpkm values, grouped by isoforms" + outputSource: rpkm_calculation/isoforms_file + + rpkm_genes: + type: File + format: "http://edamontology.org/format_3475" + label: "RPKM, grouped by gene name" + doc: "Calculated rpkm values, grouped by gene name" + outputSource: group_isoforms/genes_file + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'Gene Expression' + Title: 'RPKM, grouped by gene name' + + rpkm_common_tss: + type: File + format: "http://edamontology.org/format_3475" + label: "RPKM, grouped by common TSS" + doc: "Calculated rpkm values, grouped by common TSS" + outputSource: group_isoforms/common_tss_file + + htseq_count_gene_expression_file: + type: File + format: "http://edamontology.org/format_3475" + label: "HTSeq: read counts grouped by gene_id" + doc: "HTSeq: read counts grouped by gene_id" + outputSource: htseq_count_gene_expression/feature_counts_report_file + + htseq_count_stdout_log: + type: File + format: "http://edamontology.org/format_2330" + label: "HTSeq: stdout log" + doc: "HTSeq: stdout log" + outputSource: htseq_count_gene_expression/stdout_log + + htseq_count_stderr_log: + type: File + format: "http://edamontology.org/format_2330" + label: "HTSeq: stderr log" + doc: "HTSeq: stderr log" + outputSource: htseq_count_gene_expression/stderr_log + + get_stat_log: + type: File? 
+ label: "YAML formatted combined log" + format: "http://edamontology.org/format_3750" + doc: "YAML formatted combined log" + outputSource: get_stat/collected_statistics_yaml + + get_stat_markdown: + type: File? + label: "Markdown formatted combined log" + format: "http://edamontology.org/format_3835" + doc: "Markdown formatted combined log" + outputSource: get_stat/collected_statistics_md + 'sd:visualPlugins': + - markdownView: + tab: 'Overview' + + get_formatted_stats: + type: File? + label: "Bowtie, STAR and GEEP mapping stats" + format: "http://edamontology.org/format_2330" + doc: "Processed and combined Bowtie & STAR aligner and GEEP logs" + outputSource: get_stat/collected_statistics_tsv + 'sd:visualPlugins': + - tableView: + vertical: true + tab: 'Overview' + 'sd:preview': + 'sd:visualPlugins': + - pie: + colors: ['#b3de69', '#99c0db', '#fdc381', '#fb8072'] + data: [$2, $3, $4, $5] + + bam_statistics_report: + type: File + label: "BAM statistics report" + format: "http://edamontology.org/format_2330" + doc: "BAM statistics report (right after alignment and sorting)" + outputSource: get_bam_statistics/log_file + + insert_size_report: + type: File + label: "Insert size distribution report" + format: "http://edamontology.org/format_3475" + doc: "Insert size distribution report (right after alignment and sorting)" + outputSource: get_bam_statistics/ext_is_section + 'sd:visualPlugins': + - scatter: + tab: 'QC Plots' + Title: 'Insert Size Distribution' + xAxisTitle: 'Insert size' + yAxisTitle: 'Pairs total' + colors: ["#4b78a3"] + height: 500 + data: [$1, $2] + comparable: "isdp" + + +steps: + + extract_fastq_upstream: + run: ../tools/extract-fastq.cwl + in: + output_prefix: + default: "read_1" + compressed_file: fastq_file_upstream + out: [fastq_file] + + extract_fastq_downstream: + run: ../tools/extract-fastq.cwl + in: + output_prefix: + default: "read_2" + compressed_file: fastq_file_downstream + out: [fastq_file] + + star_aligner: + run: 
../tools/star-alignreads.cwl + in: + readFilesIn: [extract_fastq_upstream/fastq_file, extract_fastq_downstream/fastq_file] + genomeDir: star_indices_folder + outFilterMultimapNmax: + default: 1 + outFilterMismatchNmax: + default: 5 + alignSJDBoverhangMin: + default: 1 + seedSearchStartLmax: + default: 15 + clip3pNbases: clip_3p_end + clip5pNbases: clip_5p_end + threads: threads + out: + - aligned_file + - log_final + - uniquely_mapped_reads_number + - log_out + - log_progress + - log_std + - log_sj + + fastx_quality_stats_upstream: + run: ../tools/fastx-quality-stats.cwl + in: + input_file: extract_fastq_upstream/fastq_file + out: [statistics_file] + + fastx_quality_stats_downstream: + run: ../tools/fastx-quality-stats.cwl + in: + input_file: extract_fastq_downstream/fastq_file + out: [statistics_file] + + samtools_sort_index: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: star_aligner/aligned_file + sort_output_filename: + source: extract_fastq_upstream/fastq_file + valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'.bam') + threads: threads + out: [bam_bai_pair] + + bam_to_bigwig_upstream: + run: ../tools/bam-bedgraph-bigwig.cwl + in: + bam_file: samtools_sort_index/bam_bai_pair + chrom_length_file: chrom_length_file + mapped_reads_number: + source: star_aligner/uniquely_mapped_reads_number + valueFrom: $(self*2) + bigwig_filename: + source: samtools_sort_index/bam_bai_pair + valueFrom: | + ${ + var root = self.basename.split('.').slice(0,-1).join('.'); /* drop the last extension; var (not let) keeps the expression ECMAScript 5.1 compatible */ + var ext = "_upstream.bigWig"; + return (root == "")?self.basename+ext:root+ext; + } + strand: + default: '+' + dutp: + default: true + out: [bigwig_file] + + bam_to_bigwig_downstream: + run: ../tools/bam-bedgraph-bigwig.cwl + in: + bam_file: samtools_sort_index/bam_bai_pair + chrom_length_file: chrom_length_file + mapped_reads_number: + source: star_aligner/uniquely_mapped_reads_number + valueFrom: $(-self*2) + bigwig_filename: + source:
samtools_sort_index/bam_bai_pair + valueFrom: | + ${ + var root = self.basename.split('.').slice(0,-1).join('.'); /* drop the last extension; var (not let) keeps the expression ECMAScript 5.1 compatible */ + var ext = "_downstream.bigWig"; + return (root == "")?self.basename+ext:root+ext; + } + strand: + default: '-' + dutp: + default: true + out: [bigwig_file] + + bowtie_aligner: + run: ../tools/bowtie-alignreads.cwl + in: + upstream_filelist: extract_fastq_upstream/fastq_file + downstream_filelist: extract_fastq_downstream/fastq_file + indices_folder: bowtie_indices_folder + clip_3p_end: clip_3p_end + clip_5p_end: clip_5p_end + v: + default: 3 + m: + default: 1 + sam: + default: true + threads: threads + out: [log_file] + + rpkm_calculation: + run: ../tools/geep.cwl + in: + bam_file: samtools_sort_index/bam_bai_pair + annotation_file: annotation_file + dutp: + default: true + rpkm_threshold: + default: 0.001 + exclude_chr: exclude_chr + threads: threads + out: [isoforms_file] + + group_isoforms: + run: ../tools/group-isoforms.cwl + in: + isoforms_file: rpkm_calculation/isoforms_file + out: + - genes_file + - common_tss_file + + get_annotation_gtf: + run: ../tools/ucsc-genepredtogtf.cwl + in: + annotation_tsv_file: annotation_file + out: + - annotation_gtf_file + + htseq_count_gene_expression: + run: ../tools/htseq-count.cwl + in: + alignment_bam_file: samtools_sort_index/bam_bai_pair + annotation_gtf_file: get_annotation_gtf/annotation_gtf_file + strand_specific: + default: "reverse" + feature_type: + default: "exon" + feature_id: + default: "gene_id" + out: + - feature_counts_report_file + - stdout_log + - stderr_log + + get_bam_statistics: + run: ../tools/samtools-stats.cwl + in: + bambai_pair: samtools_sort_index/bam_bai_pair + output_filename: + source: samtools_sort_index/bam_bai_pair + valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") + out: [log_file, ext_is_section] + + get_stat: + run: ../tools/collect-statistics-rna-seq.cwl + in: + star_alignment_report: star_aligner/log_final + bowtie_alignment_report: bowtie_aligner/log_file
+ bam_statistics_report: get_bam_statistics/log_file + isoforms_file: rpkm_calculation/isoforms_file + paired_end: + default: true + out: [collected_statistics_yaml, collected_statistics_tsv, collected_statistics_md] + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +s:name: "Deprecated. RNA-Seq pipeline paired-end strand specific" +label: "Deprecated. RNA-Seq pipeline paired-end strand specific" +s:alternateName: "RNA-Seq basic analysis workflow for strand specific paired-end experiment" + +s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/rnaseq-pe-dutp.cwl +s:codeRepository: https://github.com/datirium/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + - class: s:Person + s:name: Andrey Kartashov + s:email: mailto:Andrey.Kartashov@cchmc.org + s:sameAs: + - id: http://orcid.org/0000-0001-9102-5681 + + +# doc: +# $include: ../descriptions/rnaseq-pe-dutp.md + + +doc: | + The original [BioWardrobe's](https://biowardrobe.com) [PubMed 
ID:26248465](https://www.ncbi.nlm.nih.gov/pubmed/26248465) + **RNA-Seq** basic analysis for a **strand specific paired-end** experiment. + The corresponding input [FASTQ](http://maq.sourceforge.net/fastq.shtml) files have to be provided. + + Current workflow should be used only with the paired-end strand specific RNA-Seq data. It performs the following steps: + 1. Use STAR to align reads from input FASTQ files according to the predefined reference indices; generate unsorted BAM file and alignment statistics file + 2. Use fastx_quality_stats to analyze input FASTQ files and generate quality statistics files + 3. Use samtools sort to generate coordinate sorted BAM(+BAI) file pair from the unsorted BAM file obtained on the step 1 (after running STAR) + 4. Generate BigWig files (one per strand) based on the sorted BAM file + 5. Map input FASTQ files to predefined rRNA reference indices using Bowtie to define the level of rRNA contamination; export resulting statistics to file + 6. Calculate isoform expression level for the sorted BAM file and GTF/TAB annotation file using GEEP reads-counting utility; export results to file \ No newline at end of file diff --git a/workflows/rnaseq-pe.cwl b/workflows/rnaseq-pe.cwl new file mode 100644 index 00000000..3e780e30 --- /dev/null +++ b/workflows/rnaseq-pe.cwl @@ -0,0 +1,544 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var get_root = function(basename) { + return basename.split('.').slice(0,1).join('.'); + }; + + +'sd:metadata': + - "../metadata/rnaseq-header.cwl" + +'sd:upstream': + genome_indices: "genome-indices.cwl" + + +inputs: + +# General inputs + + star_indices_folder: + type: Directory + label: "STAR indices folder" + 'sd:upstreamSource': "genome_indices/star_indices" + doc: "Path to STAR generated indices" + + bowtie_indices_folder: + type: Directory + label: "BowTie Ribosomal
Indices" + 'sd:upstreamSource': "genome_indices/ribosomal_indices" + doc: "Path to Bowtie generated indices" + + chrom_length_file: + type: File + label: "Chromosome length file" + format: "http://edamontology.org/format_2330" + 'sd:upstreamSource': "genome_indices/chrom_length" + doc: "Chromosome length file" + + annotation_file: + type: File + label: "Annotation file" + format: + - "http://edamontology.org/format_2306" + - "http://edamontology.org/format_3475" + 'sd:upstreamSource': "genome_indices/annotation" + doc: "GTF or TAB-separated annotation file" + + fastq_file_upstream: + type: File + label: "FASTQ 1 input file" + format: "http://edamontology.org/format_1930" + doc: "Reads data in a FASTQ format, received after paired end sequencing" + + fastq_file_downstream: + type: File + label: "FASTQ 2 input file" + format: "http://edamontology.org/format_1930" + doc: "Reads data in a FASTQ format, received after paired end sequencing" + +# Advanced inputs + + exclude_chr: + type: string? + 'sd:layout': + advanced: true + label: "Chromosome to be excluded in rpkm calculation" + doc: "Chromosome to be excluded in rpkm calculation" + + clip_3p_end: + type: int? + default: 0 + 'sd:layout': + advanced: true + label: "Clip from 3p end" + doc: "Number of bases to clip from the 3p end" + + clip_5p_end: + type: int? + default: 0 + 'sd:layout': + advanced: true + label: "Clip from 5p end" + doc: "Number of bases to clip from the 5p end" + +# System dependent + + threads: + type: int? 
+ default: 2 + 'sd:layout': + advanced: true + label: "Number of threads" + doc: "Number of threads for those steps that support multithreading" + + +outputs: + + bigwig: + type: File + format: "http://edamontology.org/format_3006" + label: "BigWig file" + doc: "Generated BigWig file" + outputSource: bam_to_bigwig/bigwig_file + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "BigWig Track" + height: 120 + + star_final_log: + type: File + format: "http://edamontology.org/format_2330" + label: "STAR final log" + doc: "STAR Log.final.out" + outputSource: star_aligner/log_final + + star_out_log: + type: File? + format: "http://edamontology.org/format_2330" + label: "STAR log out" + doc: "STAR Log.out" + outputSource: star_aligner/log_out + + star_progress_log: + type: File? + format: "http://edamontology.org/format_2330" + label: "STAR progress log" + doc: "STAR Log.progress.out" + outputSource: star_aligner/log_progress + + star_stdout_log: + type: File? + format: "http://edamontology.org/format_2330" + label: "STAR stdout log" + doc: "STAR Log.std.out" + outputSource: star_aligner/log_std + + star_sj_log: + type: File? 
+ format: "http://edamontology.org/format_2330" + label: "STAR sj log" + doc: "STAR SJ.out.tab" + outputSource: star_aligner/log_sj + + fastx_statistics_upstream: + type: File + format: "http://edamontology.org/format_2330" + label: "FASTQ 1 statistics" + doc: "fastx_quality_stats generated FASTQ 1 quality statistics file" + outputSource: fastx_quality_stats_upstream/statistics_file + 'sd:visualPlugins': + - line: + tab: 'QC Plots' + Title: 'FASTQ 1 Base frequency plot' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Frequency' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$13, $14, $15, $16, $17] + - boxplot: + tab: 'QC Plots' + Title: 'FASTQ 1 Quality Control' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Quality score' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$11, $7, $8, $9, $12] + + fastx_statistics_downstream: + type: File + format: "http://edamontology.org/format_2330" + label: "FASTQ 2 statistics" + doc: "fastx_quality_stats generated FASTQ 2 quality statistics file" + outputSource: fastx_quality_stats_downstream/statistics_file + 'sd:visualPlugins': + - line: + tab: 'QC Plots' + Title: 'FASTQ 2 Base frequency plot' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Frequency' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$13, $14, $15, $16, $17] + - boxplot: + tab: 'QC Plots' + Title: 'FASTQ 2 Quality Control' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Quality score' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$11, $7, $8, $9, $12] + + bambai_pair: + type: File + format: "http://edamontology.org/format_2572" + label: "Coordinate sorted BAM alignment file (+index BAI)" + doc: "Coordinate sorted BAM file and BAI index file" + outputSource: samtools_sort_index/bam_bai_pair + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + optional: true + type: 'alignment' + format: 'bam' + name: "BAM 
Track" + displayMode: "SQUISHED" + + bowtie_log: + type: File + format: "http://edamontology.org/format_2330" + label: "Bowtie alignment log" + doc: "Bowtie alignment log file" + outputSource: bowtie_aligner/log_file + + rpkm_isoforms: + type: File + format: "http://edamontology.org/format_3752" + label: "RPKM, grouped by isoforms" + doc: "Calculated rpkm values, grouped by isoforms" + outputSource: rpkm_calculation/isoforms_file + + rpkm_genes: + type: File + format: "http://edamontology.org/format_3475" + label: "RPKM, grouped by gene name" + doc: "Calculated rpkm values, grouped by gene name" + outputSource: group_isoforms/genes_file + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'Gene Expression' + Title: 'RPKM, grouped by gene name' + + rpkm_common_tss: + type: File + format: "http://edamontology.org/format_3475" + label: "RPKM, grouped by common TSS" + doc: "Calculated rpkm values, grouped by common TSS" + outputSource: group_isoforms/common_tss_file + + htseq_count_gene_expression_file: + type: File + format: "http://edamontology.org/format_3475" + label: "HTSeq: read counts grouped by gene_id" + doc: "HTSeq: read counts grouped by gene_id" + outputSource: htseq_count_gene_expression/feature_counts_report_file + + htseq_count_stdout_log: + type: File + format: "http://edamontology.org/format_2330" + label: "HTSeq: stdout log" + doc: "HTSeq: stdout log" + outputSource: htseq_count_gene_expression/stdout_log + + htseq_count_stderr_log: + type: File + format: "http://edamontology.org/format_2330" + label: "HTSeq: stderr log" + doc: "HTSeq: stderr log" + outputSource: htseq_count_gene_expression/stderr_log + + get_stat_log: + type: File? + label: "YAML formatted combined log" + format: "http://edamontology.org/format_3750" + doc: "YAML formatted combined log" + outputSource: get_stat/collected_statistics_yaml + + get_stat_markdown: + type: File? 
+ label: "Markdown formatted combined log" + format: "http://edamontology.org/format_3835" + doc: "Markdown formatted combined log" + outputSource: get_stat/collected_statistics_md + 'sd:visualPlugins': + - markdownView: + tab: 'Overview' + + get_formatted_stats: + type: File? + label: "Bowtie, STAR and GEEP mapping stats" + format: "http://edamontology.org/format_2330" + doc: "Processed and combined Bowtie & STAR aligner and GEEP logs" + outputSource: get_stat/collected_statistics_tsv + 'sd:visualPlugins': + - tableView: + vertical: true + tab: 'Overview' + 'sd:preview': + 'sd:visualPlugins': + - pie: + colors: ['#b3de69', '#99c0db', '#fdc381', '#fb8072'] + data: [$2, $3, $4, $5] + + bam_statistics_report: + type: File + label: "BAM statistics report" + format: "http://edamontology.org/format_2330" + doc: "BAM statistics report (right after alignment and sorting)" + outputSource: get_bam_statistics/log_file + + insert_size_report: + type: File + label: "Insert size distribution report" + format: "http://edamontology.org/format_3475" + doc: "Insert size distribution report (right after alignment and sorting)" + outputSource: get_bam_statistics/ext_is_section + 'sd:visualPlugins': + - scatter: + tab: 'QC Plots' + Title: 'Insert Size Distribution' + xAxisTitle: 'Insert size' + yAxisTitle: 'Pairs total' + colors: ["#4b78a3"] + height: 500 + data: [$1, $2] + comparable: "isdp" + +steps: + + extract_fastq_upstream: + run: ../tools/extract-fastq.cwl + in: + output_prefix: + default: "read_1" + compressed_file: fastq_file_upstream + out: [fastq_file] + + extract_fastq_downstream: + run: ../tools/extract-fastq.cwl + in: + output_prefix: + default: "read_2" + compressed_file: fastq_file_downstream + out: [fastq_file] + + star_aligner: + run: ../tools/star-alignreads.cwl + in: + readFilesIn: [extract_fastq_upstream/fastq_file, extract_fastq_downstream/fastq_file] + genomeDir: star_indices_folder + outFilterMultimapNmax: + default: 1 + outFilterMismatchNmax: + default: 5 + 
alignSJDBoverhangMin: + default: 1 + seedSearchStartLmax: + default: 15 + clip3pNbases: clip_3p_end + clip5pNbases: clip_5p_end + threads: threads + out: + - aligned_file + - log_final + - uniquely_mapped_reads_number + - log_out + - log_progress + - log_std + - log_sj + + fastx_quality_stats_upstream: + run: ../tools/fastx-quality-stats.cwl + in: + input_file: extract_fastq_upstream/fastq_file + out: [statistics_file] + + fastx_quality_stats_downstream: + run: ../tools/fastx-quality-stats.cwl + in: + input_file: extract_fastq_downstream/fastq_file + out: [statistics_file] + + samtools_sort_index: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: star_aligner/aligned_file + sort_output_filename: + source: extract_fastq_upstream/fastq_file + valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'.bam') + threads: threads + out: [bam_bai_pair] + + bam_to_bigwig: + run: ../tools/bam-bedgraph-bigwig.cwl + in: + bam_file: samtools_sort_index/bam_bai_pair + chrom_length_file: chrom_length_file + mapped_reads_number: + source: star_aligner/uniquely_mapped_reads_number + valueFrom: $(self*2) + out: [bigwig_file] + + bowtie_aligner: + run: ../tools/bowtie-alignreads.cwl + in: + upstream_filelist: extract_fastq_upstream/fastq_file + downstream_filelist: extract_fastq_downstream/fastq_file + indices_folder: bowtie_indices_folder + clip_3p_end: clip_3p_end + clip_5p_end: clip_5p_end + v: + default: 3 + m: + default: 1 + sam: + default: true + threads: threads + out: [log_file] + + rpkm_calculation: + run: ../tools/geep.cwl + in: + bam_file: samtools_sort_index/bam_bai_pair + annotation_file: annotation_file + rpkm_threshold: + default: 0.001 + exclude_chr: exclude_chr + threads: threads + out: [isoforms_file] + + group_isoforms: + run: ../tools/group-isoforms.cwl + in: + isoforms_file: rpkm_calculation/isoforms_file + out: + - genes_file + - common_tss_file + + get_annotation_gtf: + run: ../tools/ucsc-genepredtogtf.cwl + in: + 
annotation_tsv_file: annotation_file + out: + - annotation_gtf_file + + htseq_count_gene_expression: + run: ../tools/htseq-count.cwl + in: + alignment_bam_file: samtools_sort_index/bam_bai_pair + annotation_gtf_file: get_annotation_gtf/annotation_gtf_file + strand_specific: + default: "no" + feature_type: + default: "exon" + feature_id: + default: "gene_id" + out: + - feature_counts_report_file + - stdout_log + - stderr_log + + get_bam_statistics: + run: ../tools/samtools-stats.cwl + in: + bambai_pair: samtools_sort_index/bam_bai_pair + output_filename: + source: samtools_sort_index/bam_bai_pair + valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") + out: [log_file, ext_is_section] + + get_stat: + run: ../tools/collect-statistics-rna-seq.cwl + in: + star_alignment_report: star_aligner/log_final + bowtie_alignment_report: bowtie_aligner/log_file + bam_statistics_report: get_bam_statistics/log_file + isoforms_file: rpkm_calculation/isoforms_file + paired_end: + default: true + out: [collected_statistics_yaml, collected_statistics_tsv, collected_statistics_md] + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +s:name: "Deprecated. RNA-Seq pipeline paired-end" +label: "Deprecated. 
RNA-Seq pipeline paired-end" +s:alternateName: "RNA-Seq basic analysis workflow for paired-end experiment" + +s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/rnaseq-pe.cwl +s:codeRepository: https://github.com/datirium/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + - class: s:Person + s:name: Andrey Kartashov + s:email: mailto:Andrey.Kartashov@cchmc.org + s:sameAs: + - id: http://orcid.org/0000-0001-9102-5681 + + +# doc: +# $include: ../descriptions/rnaseq-pe.md + + +doc: | + The original [BioWardrobe's](https://biowardrobe.com) [PubMed ID:26248465](https://www.ncbi.nlm.nih.gov/pubmed/26248465) + **RNA-Seq** basic analysis for a **paired-end** experiment. + A corresponding input [FASTQ](http://maq.sourceforge.net/fastq.shtml) file has to be provided. + + Current workflow should be used only with the paired-end RNA-Seq data. It performs the following steps: + 1. Use STAR to align reads from input FASTQ files according to the predefined reference indices; generate unsorted BAM file and alignment statistics file + 2. 
Use fastx_quality_stats to analyze input FASTQ files and generate quality statistics files + 3. Use samtools sort to generate coordinate sorted BAM(+BAI) file pair from the unsorted BAM file obtained on the step 1 (after running STAR) + 4. Generate BigWig file on the base of sorted BAM file + 5. Map input FASTQ files to predefined rRNA reference indices using Bowtie to define the level of rRNA contamination; export resulted statistics to file + 6. Calculate isoform expression level for the sorted BAM file and GTF/TAB annotation file using GEEP reads-counting utility; export results to file \ No newline at end of file diff --git a/workflows/rnaseq-se-dutp-mitochondrial.cwl b/workflows/rnaseq-se-dutp-mitochondrial.cwl new file mode 100644 index 00000000..a3302916 --- /dev/null +++ b/workflows/rnaseq-se-dutp-mitochondrial.cwl @@ -0,0 +1,574 @@ +cwlVersion: v1.0 +class: Workflow + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var get_root = function(basename) { + return basename.split('.').slice(0,1).join('.'); + }; + +'sd:metadata': + - "../metadata/rnaseq-header.cwl" + + +'sd:upstream': + genome_indices: "genome-indices.cwl" + + +inputs: + +# General inputs + + star_indices_folder: + type: Directory + label: "STAR indices folder" + 'sd:upstreamSource': "genome_indices/star_indices" + doc: "Path to STAR generated indices" + + star_indices_folder_mitochondrial: + type: Directory + label: "STAR indices mitochondrial folder" + 'sd:upstreamSource': "genome_indices/mitochondrial_indices" + doc: "Path to STAR generated indices for mitochondrial dna" + + bowtie_indices_folder: + type: Directory + label: "BowTie Ribosomal Indices" + 'sd:upstreamSource': "genome_indices/ribosomal_indices" + doc: "Path to Bowtie generated indices" + + chrom_length_file: + type: File + label: "Chromosome length file" + format: 
"http://edamontology.org/format_2330" + 'sd:upstreamSource': "genome_indices/chrom_length" + doc: "Chromosome length file" + + annotation_file: + type: File + label: "Annotation file" + format: + - "http://edamontology.org/format_2306" + - "http://edamontology.org/format_3475" + 'sd:upstreamSource': "genome_indices/annotation" + doc: "GTF or TAB-separated annotation file" + + fastq_file: + type: File + label: "FASTQ input file" + format: "http://edamontology.org/format_1930" + doc: "Reads data in a FASTQ format" + +# Advanced inputs + + exclude_chr: + type: string? + 'sd:layout': + advanced: true + label: "Chromosome to be excluded in rpkm calculation" + doc: "Chromosome to be excluded in rpkm calculation" + + clip_3p_end: + type: int? + default: 0 + 'sd:layout': + advanced: true + label: "Clip from 3p end" + doc: "Number of bases to clip from the 3p end" + + clip_5p_end: + type: int? + default: 0 + 'sd:layout': + advanced: true + label: "Clip from 5p end" + doc: "Number of bases to clip from the 5p end" + +# System dependent + + threads: + type: int? 
+ default: 2 + 'sd:layout': + advanced: true + label: "Number of threads" + doc: "Number of threads for those steps that support multithreading" + +outputs: + + bigwig_upstream: + type: File + format: "http://edamontology.org/format_3006" + label: "BigWig file" + doc: "Generated BigWig file for (+)strand reads" + outputSource: bam_to_bigwig_upstream/bigwig_file + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "(+)strand BigWig" + height: 120 + + bigwig_downstream: + type: File + format: "http://edamontology.org/format_3006" + label: "BigWig file" + doc: "Generated BigWig file for (-)strand reads" + outputSource: bam_to_bigwig_downstream/bigwig_file + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "(-)strand BigWig" + height: 120 + + star_final_log: + type: File + format: "http://edamontology.org/format_2330" + label: "STAR final log" + doc: "STAR Log.final.out" + outputSource: star_aligner/log_final + + star_out_log: + type: File? + format: "http://edamontology.org/format_2330" + label: "STAR log out" + doc: "STAR Log.out" + outputSource: star_aligner/log_out + + star_progress_log: + type: File? + format: "http://edamontology.org/format_2330" + label: "STAR progress log" + doc: "STAR Log.progress.out" + outputSource: star_aligner/log_progress + + star_stdout_log: + type: File? + format: "http://edamontology.org/format_2330" + label: "STAR stdout log" + doc: "STAR Log.std.out" + outputSource: star_aligner/log_std + + star_sj_log: + type: File? 
+ format: "http://edamontology.org/format_2330" + label: "STAR sj log" + doc: "STAR SJ.out.tab" + outputSource: star_aligner/log_sj + + fastx_statistics: + type: File + format: "http://edamontology.org/format_2330" + label: "FASTQ statistics" + doc: "fastx_quality_stats generated FASTQ file quality statistics file" + outputSource: fastx_quality_stats/statistics_file + 'sd:visualPlugins': + - line: + tab: 'QC Plots' + Title: 'Base frequency plot' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Frequency' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$13, $14, $15, $16, $17] + - boxplot: + tab: 'QC Plots' + Title: 'Quality Control' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Quality score' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$11, $7, $8, $9, $12] + + bam_merged_index: + type: File + format: "http://edamontology.org/format_2572" + label: "Coordinate sorted BAM alignment file (+index BAI)" + doc: "Coordinate sorted BAM file and BAI index file" + outputSource: merge_original_and_mitochondrial_index/bam_bai_pair + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + optional: true + type: 'alignment' + format: 'bam' + name: "BAM Track" + displayMode: "SQUISHED" + + bowtie_log: + type: File + format: "http://edamontology.org/format_2330" + label: "Bowtie alignment log" + doc: "Bowtie alignment log file" + outputSource: bowtie_aligner/log_file + + rpkm_isoforms: + type: File + format: "http://edamontology.org/format_3752" + label: "RPKM, grouped by isoforms" + doc: "Calculated rpkm values, grouped by isoforms" + outputSource: rpkm_calculation/isoforms_file + + rpkm_genes: + type: File + format: "http://edamontology.org/format_3475" + label: "RPKM, grouped by gene name" + doc: "Calculated rpkm values, grouped by gene name" + outputSource: group_isoforms/genes_file + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'Gene Expression' + Title: 'RPKM, grouped by gene 
name' + + rpkm_common_tss: + type: File + format: "http://edamontology.org/format_3475" + label: "RPKM, grouped by common TSS" + doc: "Calculated rpkm values, grouped by common TSS" + outputSource: group_isoforms/common_tss_file + + htseq_count_gene_expression_file: + type: File + format: "http://edamontology.org/format_3475" + label: "HTSeq: read counts grouped by gene_id" + doc: "HTSeq: read counts grouped by gene_id" + outputSource: htseq_count_gene_expression/feature_counts_report_file + + htseq_count_stdout_log: + type: File + format: "http://edamontology.org/format_2330" + label: "HTSeq: stdout log" + doc: "HTSeq: stdout log" + outputSource: htseq_count_gene_expression/stdout_log + + htseq_count_stderr_log: + type: File + format: "http://edamontology.org/format_2330" + label: "HTSeq: stderr log" + doc: "HTSeq: stderr log" + outputSource: htseq_count_gene_expression/stderr_log + + get_stat_log: + type: File? + label: "YAML formatted combined log" + format: "http://edamontology.org/format_3750" + doc: "YAML formatted combined log" + outputSource: get_stat/collected_statistics_yaml + + get_stat_markdown: + type: File? + label: "Markdown formatted combined log" + format: "http://edamontology.org/format_3835" + doc: "Markdown formatted combined log" + outputSource: get_stat/collected_statistics_md + 'sd:visualPlugins': + - markdownView: + tab: 'Overview' + + get_formatted_stats: + type: File? 
+ label: "Bowtie, STAR and GEEP mapping stats" + format: "http://edamontology.org/format_2330" + doc: "Processed and combined Bowtie & STAR aligner and GEEP logs" + outputSource: get_stat/collected_statistics_tsv + 'sd:visualPlugins': + - tableView: + vertical: true + tab: 'Overview' + 'sd:preview': + 'sd:visualPlugins': + - pie: + colors: ['#b3de69', '#99c0db', '#fdc381', '#fb8072'] + data: [$2, $3, $4, $5] + + bam_statistics_report: + type: File + label: "BAM statistics report" + format: "http://edamontology.org/format_2330" + doc: "BAM statistics report (right after alignment and sorting)" + outputSource: get_bam_statistics/log_file + + +steps: + + extract_fastq: + run: ../tools/extract-fastq.cwl + in: + compressed_file: fastq_file + out: [fastq_file] + + star_aligner: + run: ../tools/star-alignreads.cwl + in: + readFilesIn: extract_fastq/fastq_file + genomeDir: star_indices_folder + outFilterMultimapNmax: + default: 1 + outFilterMismatchNmax: + default: 5 + alignSJDBoverhangMin: + default: 1 + seedSearchStartLmax: + default: 15 + outReadsUnmapped: + default: "Fastx" + clip3pNbases: clip_3p_end + clip5pNbases: clip_5p_end + threads: threads + out: + - aligned_file + - unmapped_mate_1_file + - log_final + - uniquely_mapped_reads_number + - log_out + - log_progress + - log_std + - log_sj + + star_aligner_mitochondrial: + run: ../tools/star-alignreads.cwl + in: + readFilesIn: star_aligner/unmapped_mate_1_file + genomeDir: star_indices_folder_mitochondrial + outFilterMultimapNmax: + default: 1 + outFilterMismatchNmax: + default: 5 + alignSJDBoverhangMin: + default: 1 + seedSearchStartLmax: + default: 15 + clip3pNbases: clip_3p_end + clip5pNbases: clip_5p_end + threads: threads + out: + - aligned_file + - log_final + - uniquely_mapped_reads_number + - log_out + - log_progress + - log_std + - log_sj + + fastx_quality_stats: + run: ../tools/fastx-quality-stats.cwl + in: + input_file: extract_fastq/fastq_file + out: [statistics_file] + + 
samtools_sort_index_mitochondrial: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: star_aligner_mitochondrial/aligned_file + sort_output_filename: + source: extract_fastq/fastq_file + valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'_mitochondrial.bam') + threads: threads + out: [bam_bai_pair] + + samtools_sort_index: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: star_aligner/aligned_file + sort_output_filename: + source: extract_fastq/fastq_file + valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'_sorted.bam') + threads: threads + out: [bam_bai_pair] + + merge_original_and_mitochondrial: + run: ../tools/samtools-merge.cwl + in: + output_filename: + source: extract_fastq/fastq_file + valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'_merged.bam') + alignment_files: [ samtools_sort_index/bam_bai_pair, samtools_sort_index_mitochondrial/bam_bai_pair ] + out: [merged_alignment_file] + + merge_original_and_mitochondrial_index: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: merge_original_and_mitochondrial/merged_alignment_file + sort_output_filename: + source: extract_fastq/fastq_file + valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'.bam') + threads: threads + out: [bam_bai_pair] + + bam_to_bigwig_upstream: + run: ../tools/bam-bedgraph-bigwig.cwl + in: + bam_file: merge_original_and_mitochondrial_index/bam_bai_pair + chrom_length_file: chrom_length_file + mapped_reads_number: star_aligner/uniquely_mapped_reads_number + bigwig_filename: + source: extract_fastq/fastq_file + valueFrom: | + ${ + var root = self.basename.split('.').slice(0,-1).join('.'); + var ext = "_upstream.bigWig"; + return (root == "")?self.basename+ext:root+ext; + } + strand: + default: '+' + out: [bigwig_file] + + bam_to_bigwig_downstream: + run: ../tools/bam-bedgraph-bigwig.cwl + in: + bam_file: 
merge_original_and_mitochondrial_index/bam_bai_pair + chrom_length_file: chrom_length_file + mapped_reads_number: + source: star_aligner/uniquely_mapped_reads_number + valueFrom: $(-self) + bigwig_filename: + source: extract_fastq/fastq_file + valueFrom: | + ${ + var root = self.basename.split('.').slice(0,-1).join('.'); + var ext = "_downstream.bigWig"; + return (root == "")?self.basename+ext:root+ext; + } + strand: + default: '-' + out: [bigwig_file] + + bowtie_aligner: + run: ../tools/bowtie-alignreads.cwl + in: + upstream_filelist: extract_fastq/fastq_file + indices_folder: bowtie_indices_folder + clip_3p_end: clip_3p_end + clip_5p_end: clip_5p_end + v: + default: 3 + m: + default: 1 + best: + default: true + strata: + default: true + sam: + default: true + threads: threads + out: [log_file] + + rpkm_calculation: + run: ../tools/geep.cwl + in: + bam_file: merge_original_and_mitochondrial_index/bam_bai_pair + annotation_file: annotation_file + dutp: + default: true + rpkm_threshold: + default: 0.001 + exclude_chr: exclude_chr + threads: threads + out: [isoforms_file] + + group_isoforms: + run: ../tools/group-isoforms.cwl + in: + isoforms_file: rpkm_calculation/isoforms_file + out: + - genes_file + - common_tss_file + + get_annotation_gtf: + run: ../tools/ucsc-genepredtogtf.cwl + in: + annotation_tsv_file: annotation_file + out: + - annotation_gtf_file + + htseq_count_gene_expression: + run: ../tools/htseq-count.cwl + in: + alignment_bam_file: merge_original_and_mitochondrial_index/bam_bai_pair + annotation_gtf_file: get_annotation_gtf/annotation_gtf_file + strand_specific: + default: "reverse" + feature_type: + default: "exon" + feature_id: + default: "gene_id" + out: + - feature_counts_report_file + - stdout_log + - stderr_log + + get_bam_statistics: + run: ../tools/samtools-stats.cwl + in: + bambai_pair: samtools_sort_index/bam_bai_pair + output_filename: + source: samtools_sort_index/bam_bai_pair + valueFrom: 
$(get_root(self.basename)+"_bam_statistics_report.txt") + out: [log_file] + + get_stat: + run: ../tools/collect-statistics-rna-seq.cwl + in: + star_alignment_report: star_aligner/log_final + bowtie_alignment_report: bowtie_aligner/log_file + bam_statistics_report: get_bam_statistics/log_file + isoforms_file: rpkm_calculation/isoforms_file + out: [collected_statistics_yaml, collected_statistics_tsv, collected_statistics_md] + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +s:name: "RNA-Seq pipeline single-read stranded mitochondrial" +label: "RNA-Seq pipeline single-read stranded mitochondrial" +s:alternateName: "RNA-Seq strand specific mitochondrial workflow for single-read experiment based on BioWardrobe's basic analysis" + +s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/rnaseq-se-dutp-mitochondrial.cwl +s:codeRepository: https://github.com/datirium/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Datirium, LLC" + s:member: + - class: s:Person + s:name: Artem Barski + s:email: mailto:Artem.Barski@datirium.com + - class: s:Person + s:name: Andrey Kartashov + s:email: mailto:Andrey.Kartashov@datirium.com + s:sameAs: + - id: http://orcid.org/0000-0001-9102-5681 + + +# doc: +# $include: ../descriptions/rnaseq-se-dutp-mitochondrial.md + + +doc: | + Slightly changed original [BioWardrobe's](https://biowardrobe.com) [PubMed ID:26248465](https://www.ncbi.nlm.nih.gov/pubmed/26248465) + **RNA-Seq** basic analysis for **strand specific single-read** experiment. + Additional steps were added to map data to the mitochondrial chromosome only and then merge the output. 
+ + Experiment files in [FASTQ](http://maq.sourceforge.net/fastq.shtml) format either compressed or not can be used. + + Current workflow should be used only with single-read strand specific RNA-Seq data. It performs the following steps: + 1. `STAR` to align reads from input FASTQ file according to the predefined reference indices; generate unsorted BAM file and alignment statistics file + 2. `fastx_quality_stats` to analyze input FASTQ file and generate quality statistics file + 3. `samtools sort` to generate coordinate sorted BAM(+BAI) file pair from the unsorted BAM file obtained on the step 1 (after running STAR) + 4. Generate BigWig file on the base of sorted BAM file + 5. Map input FASTQ file to predefined rRNA reference indices using Bowtie to define the level of rRNA contamination; export resulted statistics to file + 6. Calculate isoform expression level for the sorted BAM file and GTF/TAB annotation file using `GEEP` reads-counting utility; export results to file diff --git a/workflows/rnaseq-se-dutp.cwl b/workflows/rnaseq-se-dutp.cwl new file mode 100644 index 00000000..74321f63 --- /dev/null +++ b/workflows/rnaseq-se-dutp.cwl @@ -0,0 +1,527 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var get_root = function(basename) { + return basename.split('.').slice(0,1).join('.'); + }; + + +'sd:metadata': + - "../metadata/rnaseq-header.cwl" + +'sd:upstream': + genome_indices: "genome-indices.cwl" + + +inputs: + +# General inputs + + star_indices_folder: + type: Directory + label: "STAR indices folder" + 'sd:upstreamSource': "genome_indices/star_indices" + doc: "Path to STAR generated indices" + + bowtie_indices_folder: + type: Directory + label: "BowTie Ribosomal Indices" + 'sd:upstreamSource': "genome_indices/ribosomal_indices" + doc: "Path to Bowtie generated indices" + + chrom_length_file: + type: File + 
label: "Chromosome length file" + format: "http://edamontology.org/format_2330" + 'sd:upstreamSource': "genome_indices/chrom_length" + doc: "Chromosome length file" + + annotation_file: + type: File + label: "Annotation file" + format: + - "http://edamontology.org/format_2306" + - "http://edamontology.org/format_3475" + 'sd:upstreamSource': "genome_indices/annotation" + doc: "GTF or TAB-separated annotation file" + + fastq_file: + type: File + label: "FASTQ input file" + format: "http://edamontology.org/format_1930" + doc: "Reads data in a FASTQ format" + +# Advanced inputs + + exclude_chr: + type: string? + 'sd:layout': + advanced: true + label: "Chromosome to be excluded in rpkm calculation" + doc: "Chromosome to be excluded in rpkm calculation" + + clip_3p_end: + type: int? + default: 0 + 'sd:layout': + advanced: true + label: "Clip from 3p end" + doc: "Number of bases to clip from the 3p end" + + clip_5p_end: + type: int? + default: 0 + 'sd:layout': + advanced: true + label: "Clip from 5p end" + doc: "Number of bases to clip from the 5p end" + +# System dependent + + threads: + type: int? 
+ default: 2 + 'sd:layout': + advanced: true + label: "Number of threads" + doc: "Number of threads for those steps that support multithreading" + +outputs: + + bigwig_upstream: + type: File + format: "http://edamontology.org/format_3006" + label: "BigWig file" + doc: "Generated BigWig file for (+)strand reads" + outputSource: bam_to_bigwig_upstream/bigwig_file + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "(+)strand BigWig" + height: 120 + + bigwig_downstream: + type: File + format: "http://edamontology.org/format_3006" + label: "BigWig file" + doc: "Generated BigWig file for (-)strand reads" + outputSource: bam_to_bigwig_downstream/bigwig_file + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "(-)strand BigWig" + height: 120 + + star_final_log: + type: File + format: "http://edamontology.org/format_2330" + label: "STAR final log" + doc: "STAR Log.final.out" + outputSource: star_aligner/log_final + + star_out_log: + type: File? + format: "http://edamontology.org/format_2330" + label: "STAR log out" + doc: "STAR Log.out" + outputSource: star_aligner/log_out + + star_progress_log: + type: File? + format: "http://edamontology.org/format_2330" + label: "STAR progress log" + doc: "STAR Log.progress.out" + outputSource: star_aligner/log_progress + + star_stdout_log: + type: File? + format: "http://edamontology.org/format_2330" + label: "STAR stdout log" + doc: "STAR Log.std.out" + outputSource: star_aligner/log_std + + star_sj_log: + type: File? 
+ format: "http://edamontology.org/format_2330" + label: "STAR sj log" + doc: "STAR SJ.out.tab" + outputSource: star_aligner/log_sj + + fastx_statistics: + type: File + format: "http://edamontology.org/format_2330" + label: "FASTQ statistics" + doc: "fastx_quality_stats generated FASTQ file quality statistics file" + outputSource: fastx_quality_stats/statistics_file + 'sd:visualPlugins': + - line: + tab: 'QC Plots' + Title: 'Base frequency plot' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Frequency' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$13, $14, $15, $16, $17] + - boxplot: + tab: 'QC Plots' + Title: 'Quality Control' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Quality score' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$11, $7, $8, $9, $12] + + bambai_pair: + type: File + format: "http://edamontology.org/format_2572" + label: "Coordinate sorted BAM alignment file (+index BAI)" + doc: "Coordinate sorted BAM file and BAI index file" + outputSource: samtools_sort_index/bam_bai_pair + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + optional: true + type: 'alignment' + format: 'bam' + name: "BAM Track" + displayMode: "SQUISHED" + + bowtie_log: + type: File + format: "http://edamontology.org/format_2330" + label: "Bowtie alignment log" + doc: "Bowtie alignment log file" + outputSource: bowtie_aligner/log_file + + rpkm_isoforms: + type: File + format: "http://edamontology.org/format_3752" + label: "RPKM, grouped by isoforms" + doc: "Calculated rpkm values, grouped by isoforms" + outputSource: rpkm_calculation/isoforms_file + + rpkm_genes: + type: File + format: "http://edamontology.org/format_3475" + label: "RPKM, grouped by gene name" + doc: "Calculated rpkm values, grouped by gene name" + outputSource: group_isoforms/genes_file + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'Gene Expression' + Title: 'RPKM, grouped by gene name' + + rpkm_common_tss: + 
type: File + format: "http://edamontology.org/format_3475" + label: "RPKM, grouped by common TSS" + doc: "Calculated rpkm values, grouped by common TSS" + outputSource: group_isoforms/common_tss_file + + htseq_count_gene_expression_file: + type: File + format: "http://edamontology.org/format_3475" + label: "HTSeq: read counts grouped by gene_id" + doc: "HTSeq: read counts grouped by gene_id" + outputSource: htseq_count_gene_expression/feature_counts_report_file + + htseq_count_stdout_log: + type: File + format: "http://edamontology.org/format_2330" + label: "HTSeq: stdout log" + doc: "HTSeq: stdout log" + outputSource: htseq_count_gene_expression/stdout_log + + htseq_count_stderr_log: + type: File + format: "http://edamontology.org/format_2330" + label: "HTSeq: stderr log" + doc: "HTSeq: stderr log" + outputSource: htseq_count_gene_expression/stderr_log + + get_stat_log: + type: File? + label: "YAML formatted combined log" + format: "http://edamontology.org/format_3750" + doc: "YAML formatted combined log" + outputSource: get_stat/collected_statistics_yaml + + get_stat_markdown: + type: File? + label: "Markdown formatted combined log" + format: "http://edamontology.org/format_3835" + doc: "Markdown formatted combined log" + outputSource: get_stat/collected_statistics_md + 'sd:visualPlugins': + - markdownView: + tab: 'Overview' + + get_formatted_stats: + type: File? 
+ label: "Bowtie, STAR and GEEP mapping stats" + format: "http://edamontology.org/format_2330" + doc: "Processed and combined Bowtie & STAR aligner and GEEP logs" + outputSource: get_stat/collected_statistics_tsv + 'sd:visualPlugins': + - tableView: + vertical: true + tab: 'Overview' + 'sd:preview': + 'sd:visualPlugins': + - pie: + colors: ['#b3de69', '#99c0db', '#fdc381', '#fb8072'] + data: [$2, $3, $4, $5] + + bam_statistics_report: + type: File + label: "BAM statistics report" + format: "http://edamontology.org/format_2330" + doc: "BAM statistics report (right after alignment and sorting)" + outputSource: get_bam_statistics/log_file + +steps: + + extract_fastq: + run: ../tools/extract-fastq.cwl + in: + compressed_file: fastq_file + out: [fastq_file] + + star_aligner: + run: ../tools/star-alignreads.cwl + in: + readFilesIn: extract_fastq/fastq_file + genomeDir: star_indices_folder + outFilterMultimapNmax: + default: 1 + outFilterMismatchNmax: + default: 5 + alignSJDBoverhangMin: + default: 1 + seedSearchStartLmax: + default: 15 + clip3pNbases: clip_3p_end + clip5pNbases: clip_5p_end + threads: threads + out: + - aligned_file + - log_final + - uniquely_mapped_reads_number + - log_out + - log_progress + - log_std + - log_sj + + fastx_quality_stats: + run: ../tools/fastx-quality-stats.cwl + in: + input_file: extract_fastq/fastq_file + out: [statistics_file] + + samtools_sort_index: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: star_aligner/aligned_file + sort_output_filename: + source: extract_fastq/fastq_file + valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'.bam') + threads: threads + out: [bam_bai_pair] + + bam_to_bigwig_upstream: + run: ../tools/bam-bedgraph-bigwig.cwl + in: + bam_file: samtools_sort_index/bam_bai_pair + chrom_length_file: chrom_length_file + mapped_reads_number: star_aligner/uniquely_mapped_reads_number + bigwig_filename: + source: samtools_sort_index/bam_bai_pair + valueFrom: | + ${ + let root 
= self.basename.split('.').slice(0,-1).join('.'); + let ext = "_upstream.bigWig"; + return (root == "")?self.basename+ext:root+ext; + } + strand: + default: '+' + out: [bigwig_file] + + bam_to_bigwig_downstream: + run: ../tools/bam-bedgraph-bigwig.cwl + in: + bam_file: samtools_sort_index/bam_bai_pair + chrom_length_file: chrom_length_file + mapped_reads_number: + source: star_aligner/uniquely_mapped_reads_number + valueFrom: $(-self) + bigwig_filename: + source: samtools_sort_index/bam_bai_pair + valueFrom: | + ${ + let root = self.basename.split('.').slice(0,-1).join('.'); + let ext = "_downstream.bigWig"; + return (root == "")?self.basename+ext:root+ext; + } + strand: + default: '-' + out: [bigwig_file] + + bowtie_aligner: + run: ../tools/bowtie-alignreads.cwl + in: + upstream_filelist: extract_fastq/fastq_file + indices_folder: bowtie_indices_folder + clip_3p_end: clip_3p_end + clip_5p_end: clip_5p_end + v: + default: 3 + m: + default: 1 + best: + default: true + strata: + default: true + sam: + default: true + threads: threads + out: [log_file] + + rpkm_calculation: + run: ../tools/geep.cwl + in: + bam_file: samtools_sort_index/bam_bai_pair + annotation_file: annotation_file + dutp: + default: true + rpkm_threshold: + default: 0.001 + exclude_chr: exclude_chr + threads: threads + out: [isoforms_file] + + group_isoforms: + run: ../tools/group-isoforms.cwl + in: + isoforms_file: rpkm_calculation/isoforms_file + out: + - genes_file + - common_tss_file + + get_annotation_gtf: + run: ../tools/ucsc-genepredtogtf.cwl + in: + annotation_tsv_file: annotation_file + out: + - annotation_gtf_file + + htseq_count_gene_expression: + run: ../tools/htseq-count.cwl + in: + alignment_bam_file: samtools_sort_index/bam_bai_pair + annotation_gtf_file: get_annotation_gtf/annotation_gtf_file + strand_specific: + default: "reverse" + feature_type: + default: "exon" + feature_id: + default: "gene_id" + out: + - feature_counts_report_file + - stdout_log + - stderr_log + + 
get_bam_statistics: + run: ../tools/samtools-stats.cwl + in: + bambai_pair: samtools_sort_index/bam_bai_pair + output_filename: + source: samtools_sort_index/bam_bai_pair + valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") + out: [log_file] + + get_stat: + run: ../tools/collect-statistics-rna-seq.cwl + in: + star_alignment_report: star_aligner/log_final + bowtie_alignment_report: bowtie_aligner/log_file + bam_statistics_report: get_bam_statistics/log_file + isoforms_file: rpkm_calculation/isoforms_file + out: [collected_statistics_yaml, collected_statistics_tsv, collected_statistics_md] + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +s:name: "Deprecated. RNA-Seq pipeline single-read strand specific" +label: "Deprecated. RNA-Seq pipeline single-read strand specific" +s:alternateName: "RNA-Seq basic analysis workflow for strand specific single-read experiment" + +s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/rnaseq-se-dutp.cwl +s:codeRepository: https://github.com/datirium/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: 
mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + - class: s:Person + s:name: Andrey Kartashov + s:email: mailto:Andrey.Kartashov@cchmc.org + s:sameAs: + - id: http://orcid.org/0000-0001-9102-5681 + + +# doc: +# $include: ../descriptions/rnaseq-se-dutp.md + + +doc: | + Note: should be updated + The original [BioWardrobe's](https://biowardrobe.com) [PubMed ID:26248465](https://www.ncbi.nlm.nih.gov/pubmed/26248465) + **RNA-Seq** basic analysis for **strand specific single-read** experiment. + A corresponded input [FASTQ](http://maq.sourceforge.net/fastq.shtml) file has to be provided. + + Current workflow should be used only with the single-read RNA-Seq data. It performs the following steps: + 1. Use STAR to align reads from input FASTQ file according to the predefined reference indices; generate unsorted BAM file and alignment statistics file + 2. Use fastx_quality_stats to analyze input FASTQ file and generate quality statistics file + 3. Use samtools sort to generate coordinate sorted BAM(+BAI) file pair from the unsorted BAM file obtained on the step 1 (after running STAR) + 5. Generate BigWig file on the base of sorted BAM file + 6. Map input FASTQ file to predefined rRNA reference indices using Bowtie to define the level of rRNA contamination; export resulted statistics to file + 7. 
Calculate isoform expression level for the sorted BAM file and GTF/TAB annotation file using GEEP reads-counting utility; export results to file \ No newline at end of file diff --git a/workflows/rnaseq-se.cwl b/workflows/rnaseq-se.cwl new file mode 100644 index 00000000..5d50ecf9 --- /dev/null +++ b/workflows/rnaseq-se.cwl @@ -0,0 +1,480 @@ +cwlVersion: v1.0 +class: Workflow + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var get_root = function(basename) { + return basename.split('.').slice(0,1).join('.'); + }; + + +'sd:metadata': + - "../metadata/rnaseq-header.cwl" + +'sd:upstream': + genome_indices: "genome-indices.cwl" + +inputs: + +# General inputs + + star_indices_folder: + type: Directory + label: "STAR indices folder" + 'sd:upstreamSource': "genome_indices/star_indices" + doc: "Path to STAR generated indices" + + bowtie_indices_folder: + type: Directory + label: "BowTie Ribosomal Indices" + 'sd:upstreamSource': "genome_indices/ribosomal_indices" + doc: "Path to Bowtie generated indices" + + chrom_length_file: + type: File + label: "Chromosome length file" + format: "http://edamontology.org/format_2330" + 'sd:upstreamSource': "genome_indices/chrom_length" + doc: "Chromosome length file" + + annotation_file: + type: File + label: "Annotation file" + format: + - "http://edamontology.org/format_2306" + - "http://edamontology.org/format_3475" + 'sd:upstreamSource': "genome_indices/annotation" + doc: "GTF or TAB-separated annotation file" + + fastq_file: + type: File + label: "FASTQ input file" + format: "http://edamontology.org/format_1930" + doc: "Reads data in a FASTQ format" + +# Advanced inputs + + exclude_chr: + type: string? + 'sd:layout': + advanced: true + label: "Chromosome to be excluded in rpkm calculation" + doc: "Chromosome to be excluded in rpkm calculation" + + clip_3p_end: + type: int? 
+ default: 0 + 'sd:layout': + advanced: true + label: "Clip from 3p end" + doc: "Number of bases to clip from the 3p end" + + clip_5p_end: + type: int? + default: 0 + 'sd:layout': + advanced: true + label: "Clip from 5p end" + doc: "Number of bases to clip from the 5p end" + +# System dependent + + threads: + type: int? + default: 2 + 'sd:layout': + advanced: true + label: "Number of threads" + doc: "Number of threads for those steps that support multithreading" + +outputs: + + bigwig: + type: File + format: "http://edamontology.org/format_3006" + label: "BigWig file" + doc: "Generated BigWig file" + outputSource: bam_to_bigwig/bigwig_file + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + type: 'wig' + name: "BigWig Track" + height: 120 + + star_final_log: + type: File + format: "http://edamontology.org/format_2330" + label: "STAR final log" + doc: "STAR Log.final.out" + outputSource: star_aligner/log_final + + star_out_log: + type: File? + format: "http://edamontology.org/format_2330" + label: "STAR log out" + doc: "STAR Log.out" + outputSource: star_aligner/log_out + + star_progress_log: + type: File? + format: "http://edamontology.org/format_2330" + label: "STAR progress log" + doc: "STAR Log.progress.out" + outputSource: star_aligner/log_progress + + star_stdout_log: + type: File? + format: "http://edamontology.org/format_2330" + label: "STAR stdout log" + doc: "STAR Log.std.out" + outputSource: star_aligner/log_std + + star_sj_log: + type: File? 
+ format: "http://edamontology.org/format_2330" + label: "STAR sj log" + doc: "STAR SJ.out.tab" + outputSource: star_aligner/log_sj + + fastx_statistics: + type: File + format: "http://edamontology.org/format_2330" + label: "FASTQ statistics" + doc: "fastx_quality_stats generated FASTQ file quality statistics file" + outputSource: fastx_quality_stats/statistics_file + 'sd:visualPlugins': + - line: + tab: 'QC Plots' + Title: 'Base frequency plot' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Frequency' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$13, $14, $15, $16, $17] + - boxplot: + tab: 'QC Plots' + Title: 'Quality Control' + xAxisTitle: 'Nucleotide position' + yAxisTitle: 'Quality score' + colors: ["#b3de69", "#888888", "#fb8072", "#fdc381", "#99c0db"] + data: [$11, $7, $8, $9, $12] + + bambai_pair: + type: File + format: "http://edamontology.org/format_2572" + label: "Coordinate sorted BAM alignment file (+index BAI)" + doc: "Coordinate sorted BAM file and BAI index file" + outputSource: samtools_sort_index/bam_bai_pair + 'sd:visualPlugins': + - igvbrowser: + tab: 'IGV Genome Browser' + id: 'igvbrowser' + optional: true + type: 'alignment' + format: 'bam' + name: "BAM Track" + displayMode: "SQUISHED" + + bowtie_log: + type: File + format: "http://edamontology.org/format_2330" + label: "Bowtie alignment log" + doc: "Bowtie alignment log file" + outputSource: bowtie_aligner/log_file + + rpkm_isoforms: + type: File + format: "http://edamontology.org/format_3752" + label: "RPKM, grouped by isoforms" + doc: "Calculated rpkm values, grouped by isoforms" + outputSource: rpkm_calculation/isoforms_file + + rpkm_genes: + type: File + format: "http://edamontology.org/format_3475" + label: "RPKM, grouped by gene name" + doc: "Calculated rpkm values, grouped by gene name" + outputSource: group_isoforms/genes_file + 'sd:visualPlugins': + - syncfusiongrid: + tab: 'Gene Expression' + Title: 'RPKM, grouped by gene name' + + rpkm_common_tss: + 
type: File + format: "http://edamontology.org/format_3475" + label: "RPKM, grouped by common TSS" + doc: "Calculated rpkm values, grouped by common TSS" + outputSource: group_isoforms/common_tss_file + + htseq_count_gene_expression_file: + type: File + format: "http://edamontology.org/format_3475" + label: "HTSeq: read counts grouped by gene_id" + doc: "HTSeq: read counts grouped by gene_id" + outputSource: htseq_count_gene_expression/feature_counts_report_file + + htseq_count_stdout_log: + type: File + format: "http://edamontology.org/format_2330" + label: "HTSeq: stdout log" + doc: "HTSeq: stdout log" + outputSource: htseq_count_gene_expression/stdout_log + + htseq_count_stderr_log: + type: File + format: "http://edamontology.org/format_2330" + label: "HTSeq: stderr log" + doc: "HTSeq: stderr log" + outputSource: htseq_count_gene_expression/stderr_log + + get_stat_log: + type: File? + label: "YAML formatted combined log" + format: "http://edamontology.org/format_3750" + doc: "YAML formatted combined log" + outputSource: get_stat/collected_statistics_yaml + + get_stat_markdown: + type: File? + label: "Markdown formatted combined log" + format: "http://edamontology.org/format_3835" + doc: "Markdown formatted combined log" + outputSource: get_stat/collected_statistics_md + 'sd:visualPlugins': + - markdownView: + tab: 'Overview' + + get_formatted_stats: + type: File? 
+ label: "Bowtie, STAR and GEEP mapping stats" + format: "http://edamontology.org/format_2330" + doc: "Processed and combined Bowtie & STAR aligner and GEEP logs" + outputSource: get_stat/collected_statistics_tsv + 'sd:visualPlugins': + - tableView: + vertical: true + tab: 'Overview' + 'sd:preview': + 'sd:visualPlugins': + - pie: + colors: ['#b3de69', '#99c0db', '#fdc381', '#fb8072'] + data: [$2, $3, $4, $5] + + bam_statistics_report: + type: File + label: "BAM statistics report" + format: "http://edamontology.org/format_2330" + doc: "BAM statistics report (right after alignment and sorting)" + outputSource: get_bam_statistics/log_file + + +steps: + + extract_fastq: + run: ../tools/extract-fastq.cwl + in: + compressed_file: fastq_file + out: [fastq_file] + + star_aligner: + run: ../tools/star-alignreads.cwl + in: + readFilesIn: extract_fastq/fastq_file + genomeDir: star_indices_folder + outFilterMultimapNmax: + default: 1 + outFilterMismatchNmax: + default: 5 + alignSJDBoverhangMin: + default: 1 + seedSearchStartLmax: + default: 15 + clip3pNbases: clip_3p_end + clip5pNbases: clip_5p_end + threads: threads + out: + - aligned_file + - log_final + - uniquely_mapped_reads_number + - log_out + - log_progress + - log_std + - log_sj + + fastx_quality_stats: + run: ../tools/fastx-quality-stats.cwl + in: + input_file: extract_fastq/fastq_file + out: [statistics_file] + + samtools_sort_index: + run: ../tools/samtools-sort-index.cwl + in: + sort_input: star_aligner/aligned_file + sort_output_filename: + source: extract_fastq/fastq_file + valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+'.bam') + threads: threads + out: [bam_bai_pair] + + bam_to_bigwig: + run: ../tools/bam-bedgraph-bigwig.cwl + in: + bam_file: samtools_sort_index/bam_bai_pair + chrom_length_file: chrom_length_file + mapped_reads_number: star_aligner/uniquely_mapped_reads_number +# fragmentsize is not set (STAR gives only read length). 
It will be calculated automatically by bedtools genomecov. + out: [bigwig_file] + + bowtie_aligner: + run: ../tools/bowtie-alignreads.cwl + in: + upstream_filelist: extract_fastq/fastq_file + indices_folder: bowtie_indices_folder + clip_3p_end: clip_3p_end + clip_5p_end: clip_5p_end + v: + default: 3 + m: + default: 1 + best: + default: true + strata: + default: true + sam: + default: true + threads: threads + out: [log_file] + + rpkm_calculation: + run: ../tools/geep.cwl + in: + bam_file: samtools_sort_index/bam_bai_pair + annotation_file: annotation_file + rpkm_threshold: + default: 0.001 + exclude_chr: exclude_chr + threads: threads + out: [isoforms_file] + + group_isoforms: + run: ../tools/group-isoforms.cwl + in: + isoforms_file: rpkm_calculation/isoforms_file + out: + - genes_file + - common_tss_file + + get_annotation_gtf: + run: ../tools/ucsc-genepredtogtf.cwl + in: + annotation_tsv_file: annotation_file + out: + - annotation_gtf_file + + htseq_count_gene_expression: + run: ../tools/htseq-count.cwl + in: + alignment_bam_file: samtools_sort_index/bam_bai_pair + annotation_gtf_file: get_annotation_gtf/annotation_gtf_file + strand_specific: + default: "no" + feature_type: + default: "exon" + feature_id: + default: "gene_id" + out: + - feature_counts_report_file + - stdout_log + - stderr_log + + get_bam_statistics: + run: ../tools/samtools-stats.cwl + in: + bambai_pair: samtools_sort_index/bam_bai_pair + output_filename: + source: samtools_sort_index/bam_bai_pair + valueFrom: $(get_root(self.basename)+"_bam_statistics_report.txt") + out: [log_file] + + get_stat: + run: ../tools/collect-statistics-rna-seq.cwl + in: + star_alignment_report: star_aligner/log_final + bowtie_alignment_report: bowtie_aligner/log_file + bam_statistics_report: get_bam_statistics/log_file + isoforms_file: rpkm_calculation/isoforms_file + out: [collected_statistics_yaml, collected_statistics_tsv, collected_statistics_md] + + +$namespaces: + s: http://schema.org/ + +$schemas: +- 
https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +s:name: "Deprecated. RNA-Seq pipeline single-read" +label: "Deprecated. RNA-Seq pipeline single-read" +s:alternateName: "RNA-Seq basic analysis workflow for single-read experiment" + +s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/rnaseq-se.cwl +s:codeRepository: https://github.com/datirium/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + - class: s:Person + s:name: Andrey Kartashov + s:email: mailto:Andrey.Kartashov@cchmc.org + s:sameAs: + - id: http://orcid.org/0000-0001-9102-5681 + + +# doc: +# $include: ../descriptions/rnaseq-se.md + + +doc: | + The original [BioWardrobe's](https://biowardrobe.com) [PubMed ID:26248465](https://www.ncbi.nlm.nih.gov/pubmed/26248465) + **RNA-Seq** basic analysis for a **single-read** experiment. + A corresponded input [FASTQ](http://maq.sourceforge.net/fastq.shtml) file has to be provided. + + Current workflow should be used only with the single-read RNA-Seq data. It performs the following steps: + 1. 
Use STAR to align reads from input FASTQ file according to the predefined reference indices; generate unsorted BAM file and alignment statistics file + 2. Use fastx_quality_stats to analyze input FASTQ file and generate quality statistics file + 3. Use samtools sort to generate coordinate sorted BAM(+BAI) file pair from the unsorted BAM file obtained on the step 1 (after running STAR) + 5. Generate BigWig file on the base of sorted BAM file + 6. Map input FASTQ file to predefined rRNA reference indices using Bowtie to define the level of rRNA contamination; export resulted statistics to file + 7. Calculate isoform expression level for the sorted BAM file and GTF/TAB annotation file using GEEP reads-counting utility; export results to file \ No newline at end of file From 1bdfbec3286b1c5f96c2a36fddae18f5db3fc8cf Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Sat, 27 Jan 2024 18:31:47 -0500 Subject: [PATCH 111/162] Add chr length file to bam-to-bigwig if we used bed file as input --- tools/bam-bedgraph-bigwig.cwl | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/bam-bedgraph-bigwig.cwl b/tools/bam-bedgraph-bigwig.cwl index a0888bda..62fe3a1b 100644 --- a/tools/bam-bedgraph-bigwig.cwl +++ b/tools/bam-bedgraph-bigwig.cwl @@ -93,6 +93,7 @@ steps: pairchip: pairchip fragment_size: fragment_size scale: scale + chrom_length_file: chrom_length_file mapped_reads_number: mapped_reads_number strand: strand du: dutp From 09a017bc5bc8d56e7482aa7b6057fba430330192 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Sat, 27 Jan 2024 18:38:46 -0500 Subject: [PATCH 112/162] Make rename tool to support optional input and output --- tools/rename.cwl | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tools/rename.cwl b/tools/rename.cwl index 52d10ae4..fe8397e7 100644 --- a/tools/rename.cwl +++ b/tools/rename.cwl @@ -21,15 +21,17 @@ inputs: type: string? 
default: | #!/bin/bash - cp $0 $1 - if [ -f $0.bai ]; then - cp $0.bai $1.bai + if [ -f $0 ]; then + cp $0 $1 + if [ -f $0.bai ]; then + cp $0.bai $1.bai + fi fi inputBinding: position: 1 source_file: - type: File + type: File? inputBinding: position: 5 @@ -43,12 +45,12 @@ inputs: outputs: target_file: - type: File + type: File? outputBinding: glob: $(get_target_name()) secondaryFiles: | ${ - if (inputs.source_file.secondaryFiles && inputs.source_file.secondaryFiles.length > 0){ + if (inputs.source_file && inputs.source_file.secondaryFiles && inputs.source_file.secondaryFiles.length > 0){ return inputs.target_filename+".bai"; } else { return "null"; From f9494f8eeb0f77d6a69005417f2f7b60f32ead8c Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Sat, 27 Jan 2024 19:11:08 -0500 Subject: [PATCH 113/162] Remove unused sc tools --- tools/sc-assign-cell-types.cwl | 279 ------- tools/sc_diff_expr.cwl | 460 ----------- tools/seurat-cluster.cwl | 1349 -------------------------------- 3 files changed, 2088 deletions(-) delete mode 100644 tools/sc-assign-cell-types.cwl delete mode 100644 tools/sc_diff_expr.cwl delete mode 100644 tools/seurat-cluster.cwl diff --git a/tools/sc-assign-cell-types.cwl b/tools/sc-assign-cell-types.cwl deleted file mode 100644 index e2f3fe43..00000000 --- a/tools/sc-assign-cell-types.cwl +++ /dev/null @@ -1,279 +0,0 @@ -cwlVersion: v1.0 -class: CommandLineTool - - -requirements: -- class: InlineJavascriptRequirement - - -hints: -- class: DockerRequirement - dockerPull: biowardrobe2/seurat:v0.0.15 - - -inputs: - - seurat_data_rds: - type: File - inputBinding: - prefix: "--rds" - doc: | - Path to the RDS file to load Seurat object from. - RDS file produced by run_seurat.R script. 
- - cell_type_data: - type: File - inputBinding: - prefix: "--ctype" - doc: | - Path to the cell types metadata TSV/CSV file with - "cluster" and "type" columns - - source_column: - type: string - inputBinding: - prefix: "--source" - doc: | - Column name to select clusters for cell type assignment - - target_column: - type: string - inputBinding: - prefix: "--target" - doc: | - Column name to store assigned cell types - - selected_features: - type: - - "null" - - string - - string[] - inputBinding: - prefix: "--features" - doc: | - Features of interest to evaluate expression. - Default: None - - output_prefix: - type: string? - inputBinding: - prefix: "--output" - doc: | - Output prefix. - Default: ./seurat - - export_pdf_plots: - type: boolean? - inputBinding: - prefix: "--pdf" - doc: | - Export plots in PDF. - Default: false - - threads: - type: int? - inputBinding: - prefix: "--threads" - doc: | - Threads number - Default: 1 - - -outputs: - - umap_ctype_plot_png: - type: File? - outputBinding: - glob: "*_umap_ctype.png" - doc: | - Grouped by cell type UMAP projected PCA of filtered integrated/scaled datasets. - PNG format - - umap_ctype_plot_pdf: - type: File? - outputBinding: - glob: "*_umap_ctype.pdf" - doc: | - Grouped by cell type UMAP projected PCA of filtered integrated/scaled datasets. - PDF format - - umap_ctype_spl_by_cond_plot_png: - type: File? - outputBinding: - glob: "*_umap_ctype_spl_by_cond.png" - doc: | - Split by condition grouped by cell type UMAP projected PCA of filtered integrated/scaled datasets - PNG format - - umap_ctype_spl_by_cond_plot_pdf: - type: File? - outputBinding: - glob: "*_umap_ctype_spl_by_cond.pdf" - doc: | - Split by condition grouped by cell type UMAP projected PCA of filtered integrated/scaled datasets - PDF format - - expr_avg_per_ctype_plot_png: - type: File? 
- outputBinding: - glob: "*_avg_per_ctype.png" - doc: | - Scaled average log normalized gene expression per predicted cell type of filtered integrated/scaled datasets - PNG format - - expr_avg_per_ctype_plot_pdf: - type: File? - outputBinding: - glob: "*_avg_per_ctype.pdf" - doc: | - Scaled average log normalized gene expression per predicted cell type of filtered integrated/scaled datasets - PDF format - - expr_per_ctype_cell_plot_png: - type: File? - outputBinding: - glob: "*_per_ctype_cell.png" - doc: | - Log normalized gene expression per cell of clustered filtered integrated/scaled datasets with predicted cell types - PNG format - - expr_per_ctype_cell_plot_pdf: - type: File? - outputBinding: - glob: "*_per_ctype_cell.pdf" - doc: | - Log normalized gene expression per cell of clustered filtered integrated/scaled datasets with predicted cell types - PDF format - - expr_dnst_per_ctype_plot_png: - type: File? - outputBinding: - glob: "*_dnst_per_ctype.png" - doc: | - Log normalized gene expression densities per predicted cell type of filtered integrated/scaled datasets - PNG format - - expr_dnst_per_ctype_plot_pdf: - type: File? - outputBinding: - glob: "*_dnst_per_ctype.pdf" - doc: | - Log normalized gene expression densities per predicted cell type of filtered integrated/scaled datasets - PDF format - - seurat_ctype_data_rds: - type: File? - outputBinding: - glob: "*_ctype_data.rds" - doc: | - Clustered filtered integrated/scaled Seurat data with assigned cell types. - RDS format - - cellbrowser_config_data: - type: Directory? - outputBinding: - glob: "*_cellbrowser" - doc: | - Directory with UCSC Cellbrowser configuration data - - cellbrowser_html_data: - type: Directory? - outputBinding: - glob: "*_cellbrowser/html_data" - doc: | - Directory with UCSC Cellbrowser formatted html data - - cellbrowser_html_file: - type: File? 
- outputBinding: - glob: "*_cellbrowser/html_data/index.html" - doc: | - HTML index file from the directory with UCSC Cellbrowser formatted html data - - stdout_log: - type: stdout - - stderr_log: - type: stderr - - -baseCommand: ["assign_cell_types.R"] - -stdout: assign_cell_types_stdout.log -stderr: assign_cell_types_stderr.log - - -$namespaces: - s: http://schema.org/ - -$schemas: -- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - - -label: "Single-cell Assign Cell Types" -s:name: "Single-cell Assign Cell Types" -s:alternateName: "Assigns cell types to Seurat clusters" - -s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-assign-cell-types.cwl -s:codeRepository: https://github.com/Barski-lab/workflows -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Cincinnati Children's Hospital Medical Center" - s:location: - - class: s:PostalAddress - s:addressCountry: "USA" - s:addressLocality: "Cincinnati" - s:addressRegion: "OH" - s:postalCode: "45229" - s:streetAddress: "3333 Burnet Ave" - s:telephone: "+1(513)636-4200" - s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" - s:department: - - class: s:Organization - s:legalName: "Allergy and Immunology" - s:department: - - class: s:Organization - s:legalName: "Barski Research Lab" - s:member: - - class: s:Person - s:name: Michael Kotliar - s:email: mailto:misha.kotliar@gmail.com - s:sameAs: - - id: http://orcid.org/0000-0002-6486-3898 - - -doc: | - Single-cell Assign Cell Types - ================================ - Assigns cell types to Seurat clusters - - -s:about: | - usage: assign_cell_types.R - [-h] --rds RDS --ctype CTYPE --source SOURCE --target TARGET - [--features [FEATURES [FEATURES ...]]] [--output OUTPUT] 
[--pdf] - [--threads THREADS] - - Assigns cell types to clusters - - optional arguments: - -h, --help show this help message and exit - --rds RDS Path to the RDS file to load Seurat object from. RDS - file produced by run_seurat.R script - --ctype CTYPE Path to the cell types metadata TSV/CSV file with - cluster and type columns - --source SOURCE Column name to select clusters for cell type - assignment - --target TARGET Column name to store assigned cell types - --features [FEATURES [FEATURES ...]] - Features of interest to highlight. Default: None - --output OUTPUT Output prefix. Default: ./seurat - --pdf Export plots in PDF. Default: false - --threads THREADS Threads. Default: 1 diff --git a/tools/sc_diff_expr.cwl b/tools/sc_diff_expr.cwl deleted file mode 100644 index 90fa632c..00000000 --- a/tools/sc_diff_expr.cwl +++ /dev/null @@ -1,460 +0,0 @@ -cwlVersion: v1.0 -class: CommandLineTool - - -requirements: -- class: InlineJavascriptRequirement - - -hints: -- class: DockerRequirement - dockerPull: biowardrobe2/seurat:v0.0.15 - - -inputs: - - seurat_data_rds: - type: File - inputBinding: - prefix: "--rds" - doc: | - Path to the RDS file to load Seurat object from. - RDS file can be produced by run_seurat.R script. - - conditions_data: - type: File? - inputBinding: - prefix: "--condition" - doc: | - Path to the TSV/CSV file to optionally extend Seurat object metadata. First - column 'library_id' should include all unique values from the 'new.ident' - column of the loaded from --rds Seurat object metadata. All other columns will - be added to the Seurat object metadata. If any of the provided in this file - columns were already present in the Seurat object metadata, they will be - overwritten. - Default: no metadata columns will be added or overwritten - - splitby: - type: string - inputBinding: - prefix: "--splitby" - doc: | - Column from the Seurat object metadata to split cells into two groups - to run --second vs --first differential expression analysis. 
May include - columns from the metadata fields added with --condition. - - first_cond: - type: string - inputBinding: - prefix: "--first" - doc: | - Value from the Seurat object metadata column set with --splitby to define the - first group of cells or pseudobulk RNA-Seq samples (when using --pseudo). - - second_cond: - type: string - inputBinding: - prefix: "--second" - doc: | - Value from the Seurat object metadata column set with --splitby to define the - the second group of cells or pseudobulk RNA-Seq samples (when using --pseudo). - - batchby: - type: string? - inputBinding: - prefix: "--batchby" - doc: | - Column from the Seurat object metadata to define the variable that should - be modelled as a batch effect when running differential expression analysis. - Applied only when --testuse is one of 'LR', 'negbinom', 'poisson', or 'MAST', - or when using --pseudo. May include columns from the metadata fields added - with --condition. Values selected from the column set with --batchby should - establish 1:1 relation with the 'new.ident' column of the Seurat object loaded - from --rds. - Default: do not model batch effect. - - groupby: - type: string? - inputBinding: - prefix: "--groupby" - doc: | - Column from the Seurat object metadata to group cells for optional - subsetting (for example, subset to the specific cluster or predicted - cell type). May include columns from the metadata fields added with - --condition. - - selected_groups: - type: - - "null" - - string - - string[] - inputBinding: - prefix: "--select" - doc: | - Value(s) from the column set with --groupby to optionally subset cells - before running differential expression analysis. - Default: do not subset, use all cells. - - selected_features: - type: - - "null" - - string - - string[] - inputBinding: - prefix: "--genes" - doc: | - Genes of interest to label on the generated plots. - Default: --topn N genes with the highest and the - lowest log2 fold change expression values. 
- - excluded_features: - type: - - "null" - - string - - string[] - inputBinding: - prefix: "--exgenes" - doc: | - Genes to be excluded from the differential expression analysis. - Default: include all genes - - topn_genes_count: - type: int? - inputBinding: - prefix: "--topn" - doc: | - Show N genes with the highest and N genes with the lowest log2 fold - change expression values. Ignored with --genes. - Default: 10 - - minimum_logfc: - type: float? - inputBinding: - prefix: "--minlogfc" - doc: | - Include only those genes that on average have the absolute value of log2 - fold change expression difference not lower than this value. Increasing - --minlogfc speeds up calculations, but can cause missing weaker signals. - Ignored with --pseudo. - Default: 0.25 - - minimum_pct: - type: float? - inputBinding: - prefix: "--minpct" - doc: | - Include only those genes that are detected in not lower than this fraction of cells - in either of the two tested groups. Increasing --minpct speeds up calculations by not - testing genes that are very infrequently expressed. Ignored with --pseudo. - Default: 0.1 - - maximum_pvadj: - type: float? - inputBinding: - prefix: "--maxpvadj" - doc: | - Include only those genes for which adjusted P-val is not bigger that this value. - Default: 0.1 - - test_use: - type: - - "null" - - type: enum - symbols: - - "wilcox" - - "bimod" - - "roc" - - "t" - - "negbinom" - - "poisson" - - "LR" - - "MAST" - - "DESeq2" - inputBinding: - prefix: "--testuse" - doc: | - Statistical test to use for differential gene expression analysis. - Ignored with --pseudo. - Default: wilcox - - pseudo: - type: boolean? - inputBinding: - prefix: "--pseudo" - doc: | - Aggregate gene expression of the cells from the same dataset into a pseudobulk - RNA-Seq sample before running differential expression analysis with DESeq2. - The following parameters will be ignored: --testuse, --minpct, --minlogfc. - Default: false - - lrt: - type: boolean? 
- inputBinding: - prefix: "--lrt" - doc: | - Use LRT instead of the pair-wise Wald test. Shows any differences across the variable - set with --batchby whith the log2 fold changes calculated as the average expression - changes due to criteria set with --splitby. Ignored when --pseudo or --batchby - parameters are not provided. - Default: use Wald test - - export_pdf_plots: - type: boolean? - inputBinding: - prefix: "--pdf" - doc: | - Export plots in PDF. - Default: false - - output_prefix: - type: string? - inputBinding: - prefix: "--output" - doc: | - Output prefix. - Default: ./seurat - - threads: - type: int? - inputBinding: - prefix: "--threads" - doc: | - Threads number - Default: 1 - - -outputs: - - cell_abundance_plot_png: - type: File? - outputBinding: - glob: "*_umap.png" - doc: | - Cell abundance plot split by criteria set in splitby (a.k.a condition) and optionally - subsetted by selected_groups (a.k.a clusters) from the groups defined in groupby. - PNG format - - cell_abundance_plot_pdf: - type: File? - outputBinding: - glob: "*_umap.pdf" - doc: | - Cell abundance plot split by criteria set in splitby (a.k.a condition) and optionally - subsetted by selected_groups (a.k.a clusters) from the groups defined in groupby. - PDF format - - aggr_gene_expr_plot_png: - type: File? - outputBinding: - glob: "*_counts.png" - doc: | - Log normalized aggregated gene expression split by criteria set in splitby - (a.k.a condition). - PNG format - - aggr_gene_expr_plot_pdf: - type: File? - outputBinding: - glob: "*_counts.pdf" - doc: | - Log normalized aggregated gene expression split by criteria set in splitby - (a.k.a condition). - PDF format - - diff_expr_genes_plot_png: - type: File? 
- outputBinding: - glob: "*_diff_expr_genes.png" - doc: | - Volcano plot of differentially expressed genes for second_cond vs first_cond cells - or pseudobulk RNA-Seq samples split by criteria set in splitby (a.k.a condition) - and optionally subsetted by selected_groups (a.k.a clusters) from the groups defined - in groupby. - PNG format - - diff_expr_genes_plot_pdf: - type: File? - outputBinding: - glob: "*_diff_expr_genes.pdf" - doc: | - Volcano plot of differentially expressed genes for second_cond vs first_cond cells - or pseudobulk RNA-Seq samples split by criteria set in splitby (a.k.a condition) - and optionally subsetted by selected_groups (a.k.a clusters) from the groups defined - in groupby. - PDF format - - diff_expr_genes: - type: File - outputBinding: - glob: "*_diff_expr_genes.tsv" - doc: | - Differentially expressed genes for second_cond vs first_cond cells or pseudobulk - RNA-Seq samples split by criteria set in splitby (a.k.a condition) and optionally - subsetted by selected_groups (a.k.a clusters) from the groups defined in groupby. 
- TSV format - - stdout_log: - type: stdout - - stderr_log: - type: stderr - - -baseCommand: ["sc_diff_expr.R"] - - -stdout: seurat_diff_expr_stdout.log -stderr: seurat_diff_expr_stderr.log - - -$namespaces: - s: http://schema.org/ - -$schemas: -- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - - -label: "Single-cell Differential Expression Analysis" -s:name: "Single-cell Differential Expression Analysis" -s:alternateName: "Runs differential expression analysis for a subset of cells between two selected conditions" - -s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc_diff_expr.cwl -s:codeRepository: https://github.com/Barski-lab/workflows -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Cincinnati Children's Hospital Medical Center" - s:location: - - class: s:PostalAddress - s:addressCountry: "USA" - s:addressLocality: "Cincinnati" - s:addressRegion: "OH" - s:postalCode: "45229" - s:streetAddress: "3333 Burnet Ave" - s:telephone: "+1(513)636-4200" - s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" - s:department: - - class: s:Organization - s:legalName: "Allergy and Immunology" - s:department: - - class: s:Organization - s:legalName: "Barski Research Lab" - s:member: - - class: s:Person - s:name: Michael Kotliar - s:email: mailto:misha.kotliar@gmail.com - s:sameAs: - - id: http://orcid.org/0000-0002-6486-3898 - - -doc: | - Single-cell Differential Expression Analysis - ============================================= - - Runs differential expression analysis for a subset of cells between two selected conditions - - -s:about: | - usage: /Users/kot4or/workspaces/cwl_ws/workflows/tools/dockerfiles/scripts/sc_diff_expr.R - [-h] --rds RDS [--condition CONDITION] 
--splitby SPLITBY --first FIRST - --second SECOND [--batchby BATCHBY] [--groupby GROUPBY] - [--select [SELECT ...]] [--genes [GENES ...]] [--exgenes [EXGENES ...]] - [--topn TOPN] [--minlogfc MINLOGFC] [--minpct MINPCT] - [--maxpvadj MAXPVADJ] - [--testuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] - [--pseudo] [--lrt] [--pdf] [--output OUTPUT] [--threads THREADS] - - Differential expression analysis for a subset of cells between two selected - conditions - - optional arguments: - -h, --help show this help message and exit - --rds RDS Path to the RDS file to load Seurat object from. RDS - file can be produced by run_seurat.R script. - --condition CONDITION - Path to the TSV/CSV file to optionally extend Seurat - object metadata. First column 'library_id' should - include all unique values from the 'new.ident' column - of the loaded from --rds Seurat object metadata. All - other columns will be added to the Seurat object - metadata. If any of the provided in this file columns - were already present in the Seurat object metadata, - they will be overwritten. Default: no metadata columns - will be added or overwritten - --splitby SPLITBY Column from the Seurat object metadata to split cells - into two groups to run --second vs --first - differential expression analysis. May include columns - from the metadata fields added with --condition. - --first FIRST Value from the Seurat object metadata column set with - --splitby to define the first group of cells or - pseudobulk RNA-Seq samples (when using --pseudo). - --second SECOND Value from the Seurat object metadata column set with - --splitby to define the the second group of cells or - pseudobulk RNA-Seq samples (when using --pseudo) - --batchby BATCHBY Column from the Seurat object metadata to define the - variable that should be modelled as a batch effect - when running differential expression analysis. Applied - only when --testuse is one of 'LR', 'negbinom', - 'poisson', or 'MAST', or when using --pseudo. 
May - include columns from the metadata fields added with - --condition. Values selected from the column set with - --batchby should establish 1:1 relation with the - 'new.ident' column of the Seurat object loaded from - --rds. Default: do not model batch effect. - --groupby GROUPBY Column from the Seurat object metadata to group cells - for optional subsetting (for example, subset to the - specific cluster or predicted cell type). May include - columns from the metadata fields added with - --condition. - --select [SELECT ...] - Value(s) from the column set with --groupby to - optionally subset cells before running differential - expression analysis. Default: do not subset, use all - cells. - --genes [GENES ...] Genes of interest to label on the generated plots. - Default: --topn N genes with the highest and the - lowest log2 fold change expression values. - --exgenes [EXGENES ...] - Genes to be excluded from the differential expression - analysis. Default: include all genes - --topn TOPN Show N genes with the highest and N genes with the - lowest log2 fold change expression values. Ignored - with --genes. Default: 10 - --minlogfc MINLOGFC Include only those genes that on average have the - absolute value of log2 fold change expression - difference not lower than this value. Increasing - --minlogfc speeds up calculations, but can cause - missing weaker signals. Ignored with --pseudo. - Default: 0.25 - --minpct MINPCT Include only those genes that are detected in not - lower than this fraction of cells in either of the two - tested groups. Increasing --minpct speeds up - calculations by not testing genes that are very - infrequently expressed. Ignored with --pseudo. - Default: 0.1 - --maxpvadj MAXPVADJ Include only those genes for which adjusted P-val is - not bigger that this value. Default: 0.1 - --testuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2} - Statistical test to use for differential gene - expression analysis. Ignored with --pseudo. 
Default: - wilcox - --pseudo Aggregate gene expression of the cells from the same - dataset into a pseudobulk RNA-Seq sample before - running differential expression analysis with DESeq2. - The following parameters will be ignored: --testuse, - --minpct, --minlogfc. Default: false - --lrt Use LRT instead of the pair-wise Wald test. Shows any - differences across the variable set with --batchby - whith the log2 fold changes calculated as the average - expression changes due to criteria set with --splitby. - Ignored when --pseudo or --batchby parameters are not - provided. Default: use Wald test - --pdf Export plots in PDF. Default: false - --output OUTPUT Output prefix. Default: ./seurat - --threads THREADS Threads. Default: 1 \ No newline at end of file diff --git a/tools/seurat-cluster.cwl b/tools/seurat-cluster.cwl deleted file mode 100644 index 9aeef7ab..00000000 --- a/tools/seurat-cluster.cwl +++ /dev/null @@ -1,1349 +0,0 @@ -cwlVersion: v1.0 -class: CommandLineTool - - -requirements: -- class: InlineJavascriptRequirement -- class: InitialWorkDirRequirement - listing: - - entryname: dummy_metadata.csv - entry: | - library_id - scRNA-Seq - - -hints: -- class: DockerRequirement - dockerPull: biowardrobe2/seurat:v0.0.15 - - -inputs: - - feature_bc_matrices_folder: - type: - - Directory - - type: array - items: Directory - inputBinding: - prefix: "--mex" - doc: | - Path to the folder with not normalized aggregated feature-barcode matrix - from Cell Ranger Aggregate in MEX format. If multiple locations provided - data is assumed to be not aggregated (outputs from multiple Cell Ranger - Count runs) and will be merged. - - aggregation_metadata: - type: File? - doc: | - Path to the metadata TSV/CSV file to set the datasets identities. - If --mex points to the Cell Ranger Aggregate outputs, the aggregation.csv - file can be used as well. 
If multiple locations were provided through --mex, - the file should include at least one column - 'library_id', and be sorted - based on the the order of locations provided in --mex. If metadata file was - not provided at all, the dummy_metadata.csv will be used instead assuming - that feature_bc_matrices_folder was a single file from not aggregated - expreriment. - - conditions_data: - type: File? - inputBinding: - prefix: "--condition" - doc: | - Path to the TSV/CSV file to define datasets grouping. First column - - 'library_id' with the values provided in the same order as in the - correspondent column of the --identity file, second column 'condition'. - Default: each dataset is assigned to a separate group. - - classifier_rds: - type: File? - inputBinding: - prefix: "--classifier" - doc: | - Path to the Garnett classifier RDS file for cell type prediction. - Default: skip cell type prediction. - - cell_cycle_data: - type: File? - inputBinding: - prefix: "--cellcycle" - doc: | - Path to the TSV/CSV file with cell cycle data. First column - 'phase', - second column 'gene_id'. Default: skip cell cycle score assignment. - - barcodes_data: - type: File? - inputBinding: - prefix: "--barcodes" - doc: | - Path to the headerless TSV/CSV file with the list of barcodes to select - cells of interest (one barcode per line). Prefilters input feature-barcode - matrix to include only selected cells. Default: use all cells. - - minimum_cells: - type: int? - inputBinding: - prefix: "--mincells" - doc: | - Include only features detected in at least this many cells. Applied to - aggregated feature-barcode matrix from Cell Ranger Aggregate. Ignored - when --mex points to the locations of multiple Cell Ranger Count runs. - Default: 5 - - minimum_features: - type: - - "null" - - int - - int[] - inputBinding: - prefix: "--minfeatures" - doc: | - Include cells where at least this many features are detected. 
If multiple - values provided each of them will be applied to the correspondent dataset - from the --mex input. - Default: 250 (applied to all datasets) - - maximum_features: - type: - - "null" - - int - - int[] - inputBinding: - prefix: "--maxfeatures" - doc: | - Include cells with the number of features not bigger than this value. If - multiple values provided each of them will be applied to the correspondent - dataset from the --mex input. - Default: 5000 (applied to all datasets) - - minimum_umis: - type: - - "null" - - int - - int[] - inputBinding: - prefix: "--minumi" - doc: | - Include cells where at least this many UMIs (transcripts) are detected. If - multiple values provided each of them will be applied to the correspondent - dataset from the --mex input. - Default: 500 (applied to all datasets) - - minimum_novelty_score: - type: - - "null" - - float - - float[] - inputBinding: - prefix: "--minnovelty" - doc: | - Include cells with the novelty score not lower than this value, calculated as - log10(genes)/log10(UMIs). If multiple values provided each of them will be - applied to the correspondent dataset from the --mex input. - Default: 0.8 (applied to all datasets) - - maximum_mito_perc: - type: float? - inputBinding: - prefix: "--maxmt" - doc: | - Include cells with the percentage of transcripts mapped to mitochondrial genes - not bigger than this value. - Default: 5 - - mito_pattern: - type: string? - inputBinding: - prefix: "--mitopattern" - doc: | - Regex pattern to identify mitochondrial genes. - Default: '^Mt-' - - selected_features: - type: - - "null" - - string - - string[] - inputBinding: - prefix: "--features" - doc: | - Features of interest to evaluate expression. - Default: None - - regress_cellcycle: - type: boolean? - inputBinding: - prefix: "--regresscellcycle" - doc: | - Regress cell cycle as a confounding source of variation. - Default: false - - regress_mito_perc: - type: boolean? 
- inputBinding: - prefix: "--regressmt" - doc: | - Regress mitochondrial genes expression as a confounding source of variation. - Default: false - - high_var_features_count: - type: int? - inputBinding: - prefix: "--highvarcount" - doc: | - Number of highly variable features to detect. Used for datasets integration, - scaling, and dimensional reduction. - Default: 3000 - - dimensionality: - type: int? - inputBinding: - prefix: "--ndim" - doc: | - Number of principal components to use in UMAP projection and clustering - (from 1 to 50). Use Elbow plot to adjust this parameter. - Default: 10 - - umap_spread: - type: float? - inputBinding: - prefix: "--spread" - doc: | - The effective scale of embedded points on UMAP. In combination with mindist - this determines how clustered/clumped the embedded points are. - Default: 1 - - umap_mindist: - type: float? - inputBinding: - prefix: "--mindist" - doc: | - Controls how tightly the embedding is allowed compress points together on UMAP. - Larger values ensure embedded points are moreevenly distributed, while smaller - values allow the algorithm to optimise more accurately with regard to local structure. - Sensible values are in the range 0.001 to 0.5. - Default: 0.3 - - umap_nneighbors: - type: int? - inputBinding: - prefix: "--nneighbors" - doc: | - Determines the number of neighboring points used in UMAP. Larger values will result - in more global structure being preserved at the loss of detailed local structure. - In general this parameter should often be in the range 5 to 50. 
- Default: 30 - - umap_metric: - type: - - "null" - - type: enum - symbols: - - "euclidean" - - "manhattan" - - "chebyshev" - - "minkowski" - - "canberra" - - "braycurtis" - - "mahalanobis" - - "wminkowski" - - "seuclidean" - - "cosine" - - "correlation" - - "haversine" - - "hamming" - - "jaccard" - - "dice" - - "russelrao" - - "kulsinski" - - "ll_dirichlet" - - "hellinger" - - "rogerstanimoto" - - "sokalmichener" - - "sokalsneath" - - "yule" - inputBinding: - prefix: "--umetric" - doc: | - The metric to use to compute distances in high dimensional space for UMAP. - Default: cosine - - umap_method: - type: - - "null" - - type: enum - symbols: - - "uwot" - - "uwot-learn" - - "umap-learn" - inputBinding: - prefix: "--umethod" - doc: | - UMAP implementation to run. - Default: uwot - - cluster_metric: - type: - - "null" - - type: enum - symbols: - - "euclidean" - - "cosine" - - "manhattan" - - "hamming" - inputBinding: - prefix: "--ametric" - doc: | - Distance metric used by the nearest neighbors algorithm when running clustering. - Default: cosine - - resolution: - type: - - "null" - - float - - float[] - inputBinding: - prefix: "--resolution" - doc: | - Clustering resolution. Can be set as an array. - Default: 0.4 0.6 0.8 1.0 1.4 - - minimum_logfc: - type: float? - inputBinding: - prefix: "--logfc" - doc: | - Include only those genes that on average have log fold change difference in - expression between every tested pair of clusters not lower than this value. - Default: 0.25 - - minimum_pct: - type: float? - inputBinding: - prefix: "--minpct" - doc: | - Include only those features that are detected in not lower than this fraction - of cells in either of the two tested clusters. - Default: 0.1 - - only_positive_markers: - type: boolean? - inputBinding: - prefix: "--onlypos" - doc: | - Return only positive markers when running gene markers identification. - Default: false - - no_sct: - type: boolean? 
- inputBinding: - prefix: "--nosct" - doc: | - Do not use SCTransform when running datasets integration. Use LogNormalize instead. - Default: false - - test_use: - type: - - "null" - - type: enum - symbols: - - "wilcox" - - "bimod" - - "roc" - - "t" - - "negbinom" - - "poisson" - - "LR" - - "MAST" - - "DESeq2" - inputBinding: - prefix: "--testuse" - doc: | - Statistical test to use for gene markers identification. - Default: wilcox - - species: - type: - - "null" - - type: enum - symbols: - - "hs" - - "mm" - - "none" - inputBinding: - prefix: "--species" - doc: | - Select species for gene name conversion when running cell type prediction - with Garnett classifier. - Default: do not convert gene names - - export_pdf_plots: - type: boolean? - inputBinding: - prefix: "--pdf" - doc: | - Export plots in PDF. - Default: false - - export_rds_data: - type: boolean? - inputBinding: - prefix: "--rds" - doc: | - Save Seurat data to RDS file. - Default: false - - output_prefix: - type: string? - inputBinding: - prefix: "--output" - doc: | - Output prefix. - Default: ./seurat - - threads: - type: int? - inputBinding: - prefix: "--threads" - doc: | - Threads number - Default: 1 - - -outputs: - - raw_cell_count_plot_png: - type: File? - outputBinding: - glob: "*_raw_cell_count.png" - doc: | - Number of cells per dataset (not filtered). - PNG format - - raw_cell_count_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_cell_count.pdf" - doc: | - Number of cells per dataset (not filtered). - PDF format - - raw_umi_dnst_spl_by_cond_plot_png: - type: File? - outputBinding: - glob: "*_raw_umi_dnst_spl_by_cond.png" - doc: | - Split by condition UMI density per cell (not filtered). - PNG format - - raw_umi_dnst_spl_by_cond_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_umi_dnst_spl_by_cond.pdf" - doc: | - Split by condition UMI density per cell (not filtered). - PDF format - - raw_gene_dnst_spl_by_cond_plot_png: - type: File? 
- outputBinding: - glob: "*_raw_gene_dnst_spl_by_cond.png" - doc: | - Split by condition gene density per cell (not filtered). - PNG format - - raw_gene_dnst_spl_by_cond_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_gene_dnst_spl_by_cond.pdf" - doc: | - Split by condition gene density per cell (not filtered). - PDF format - - raw_gene_umi_corr_spl_by_ident_plot_png: - type: File? - outputBinding: - glob: "*_raw_gene_umi_corr_spl_by_ident.png" - doc: | - Split by identity genes vs UMIs per cell correlation (not filtered). - PNG format - - raw_gene_umi_corr_spl_by_ident_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_gene_umi_corr_spl_by_ident.pdf" - doc: | - Split by identity genes vs UMIs per cell correlation (not filtered). - PDF format - - raw_mito_perc_dnst_spl_by_cond_plot_png: - type: File? - outputBinding: - glob: "*_raw_mito_perc_dnst_spl_by_cond.png" - doc: | - Split by condition density of transcripts mapped to mitochondrial genes per cell (not filtered). - PNG format - - raw_mito_perc_dnst_spl_by_cond_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_mito_perc_dnst_spl_by_cond.pdf" - doc: | - Split by condition density of transcripts mapped to mitochondrial genes per cell (not filtered). - PDF format - - raw_nvlt_score_dnst_spl_by_cond_plot_png: - type: File? - outputBinding: - glob: "*_raw_nvlt_score_dnst_spl_by_cond.png" - doc: | - Split by condition novelty score density per cell (not filtered). - PNG format - - raw_nvlt_score_dnst_spl_by_cond_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_nvlt_score_dnst_spl_by_cond.pdf" - doc: | - Split by condition novelty score density per cell (not filtered). - PDF format - - raw_qc_mtrcs_plot_png: - type: File? - outputBinding: - glob: "*_raw_qc_mtrcs.png" - doc: | - QC metrics densities per cell (not filtered). - PNG format - - raw_qc_mtrcs_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_qc_mtrcs.pdf" - doc: | - QC metrics densities per cell (not filtered). 
- PDF format - - raw_qc_mtrcs_gr_by_cond_plot_png: - type: File? - outputBinding: - glob: "*_raw_qc_mtrcs_gr_by_cond.png" - doc: | - Grouped by condition QC metrics densities per cell (not filtered). - PNG format - - raw_qc_mtrcs_gr_by_cond_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_qc_mtrcs_gr_by_cond.pdf" - doc: | - Grouped by condition QC metrics densities per cell (not filtered). - PDF format - - - fltr_cell_count_plot_png: - type: File? - outputBinding: - glob: "*_fltr_cell_count.png" - doc: | - Number of cells per dataset (filtered). - PNG format - - fltr_cell_count_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_cell_count.pdf" - doc: | - Number of cells per dataset (filtered). - PDF format - - fltr_umi_dnst_spl_by_cond_plot_png: - type: File? - outputBinding: - glob: "*_fltr_umi_dnst_spl_by_cond.png" - doc: | - Split by condition UMI density per cell (filtered). - PNG format - - fltr_umi_dnst_spl_by_cond_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_umi_dnst_spl_by_cond.pdf" - doc: | - Split by condition UMI density per cell (filtered). - PDF format - - fltr_gene_dnst_spl_by_cond_plot_png: - type: File? - outputBinding: - glob: "*_fltr_gene_dnst_spl_by_cond.png" - doc: | - Split by condition gene density per cell (filtered). - PNG format - - fltr_gene_dnst_spl_by_cond_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_gene_dnst_spl_by_cond.pdf" - doc: | - Split by condition gene density per cell (filtered). - PDF format - - fltr_gene_umi_corr_spl_by_ident_plot_png: - type: File? - outputBinding: - glob: "*_fltr_gene_umi_corr_spl_by_ident.png" - doc: | - Split by identity genes vs UMIs per cell correlation (filtered). - PNG format - - fltr_gene_umi_corr_spl_by_ident_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_gene_umi_corr_spl_by_ident.pdf" - doc: | - Split by identity genes vs UMIs per cell correlation (filtered). - PDF format - - fltr_mito_perc_dnst_spl_by_cond_plot_png: - type: File? 
- outputBinding: - glob: "*_fltr_mito_perc_dnst_spl_by_cond.png" - doc: | - Split by condition density of transcripts mapped to mitochondrial genes per cell (filtered). - PNG format - - fltr_mito_perc_dnst_spl_by_cond_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_mito_perc_dnst_spl_by_cond.pdf" - doc: | - Split by condition density of transcripts mapped to mitochondrial genes per cell (filtered). - PDF format - - fltr_nvlt_score_dnst_spl_by_cond_plot_png: - type: File? - outputBinding: - glob: "*_fltr_nvlt_score_dnst_spl_by_cond.png" - doc: | - Split by condition novelty score density per cell (filtered). - PNG format - - fltr_nvlt_score_dnst_spl_by_cond_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_nvlt_score_dnst_spl_by_cond.pdf" - doc: | - Split by condition novelty score density per cell (filtered). - PDF format - - fltr_qc_mtrcs_plot_png: - type: File? - outputBinding: - glob: "*_fltr_qc_mtrcs.png" - doc: | - QC metrics densities per cell (filtered). - PNG format - - fltr_qc_mtrcs_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_qc_mtrcs.pdf" - doc: | - QC metrics densities per cell (filtered). - PDF format - - fltr_qc_mtrcs_gr_by_cond_plot_png: - type: File? - outputBinding: - glob: "*_fltr_qc_mtrcs_gr_by_cond.png" - doc: | - Grouped by condition QC metrics densities per cell (filtered). - PNG format - - fltr_qc_mtrcs_gr_by_cond_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_qc_mtrcs_gr_by_cond.pdf" - doc: | - Grouped by condition QC metrics densities per cell (filtered). - PDF format - - - fltr_pca_spl_by_ph_plot_png: - type: File? - outputBinding: - glob: "*_fltr_pca_spl_by_ph.png" - doc: | - Split by cell cycle phase PCA of filtered unintegrated/scaled datasets. - PNG format - - fltr_pca_spl_by_ph_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_pca_spl_by_ph.pdf" - doc: | - Split by cell cycle phase PCA of filtered unintegrated/scaled datasets. 
- PDF format - - fltr_pca_spl_by_mito_perc_plot_png: - type: File? - outputBinding: - glob: "*_fltr_pca_spl_by_mito_perc.png" - doc: | - Split by level of transcripts mapped to mitochondrial genes PCA of filtered unintegrated/scaled datasets. - PNG format - - fltr_pca_spl_by_mito_perc_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_pca_spl_by_mito_perc.pdf" - doc: | - Split by level of transcripts mapped to mitochondrial genes PCA of filtered unintegrated/scaled datasets. - PDF format - - fltr_umap_spl_by_idnt_plot_png: - type: File? - outputBinding: - glob: "*_fltr_umap_spl_by_idnt.png" - doc: | - Split by identity UMAP projected PCA of filtered unintegrated/scaled datasets. - PNG format - - fltr_umap_spl_by_idnt_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_umap_spl_by_idnt.pdf" - doc: | - Split by identity UMAP projected PCA of filtered unintegrated/scaled datasets. - PDF format - - - ntgr_elbow_plot_png: - type: File? - outputBinding: - glob: "*_ntgr_elbow.png" - doc: | - Elbow plot from PCA of filtered integrated/scaled datasets. - PNG format - - ntgr_elbow_plot_pdf: - type: File? - outputBinding: - glob: "*_ntgr_elbow.pdf" - doc: | - Elbow plot from PCA of filtered integrated/scaled datasets. - PDF format - - ntgr_pca_plot_png: - type: File? - outputBinding: - glob: "*_ntgr_pca.png" - doc: | - PCA of filtered integrated/scaled datasets. - PNG format - - ntgr_pca_plot_pdf: - type: File? - outputBinding: - glob: "*_ntgr_pca.pdf" - doc: | - PCA of filtered integrated/scaled datasets. - PDF format - - ntgr_pca_heatmap_png: - type: File? - outputBinding: - glob: "*_ntgr_pca_heatmap.png" - doc: | - Genes per cells expression heatmap sorted by their PC scores from PCA of filtered integrated/scaled datasets. - PNG format - - ntgr_pca_heatmap_pdf: - type: File? - outputBinding: - glob: "*_ntgr_pca_heatmap.pdf" - doc: | - Genes per cells expression heatmap sorted by their PC scores from PCA of filtered integrated/scaled datasets. 
- PDF format - - ntgr_pca_loadings_plot_png: - type: File? - outputBinding: - glob: "*_ntgr_pca_loadings.png" - doc: | - PC scores of the most variant genes from PCA of filtered integrated/scaled datasets. - PNG format - - ntgr_pca_loadings_plot_pdf: - type: File? - outputBinding: - glob: "*_ntgr_pca_loadings.pdf" - doc: | - PC scores of the most variant genes from PCA of filtered integrated/scaled datasets. - PDF format - - ntgr_umap_spl_by_idnt_plot_png: - type: File? - outputBinding: - glob: "*_ntgr_umap_spl_by_idnt.png" - doc: | - Split by identity UMAP projected PCA of filtered integrated/scaled datasets. - PNG format - - ntgr_umap_spl_by_idnt_plot_pdf: - type: File? - outputBinding: - glob: "*_ntgr_umap_spl_by_idnt.pdf" - doc: | - Split by identity UMAP projected PCA of filtered integrated/scaled datasets. - PDF format - - - clst_umap_res_plot_png: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_clst_umap_res_*.png" - doc: | - Clustered UMAP projected PCA of filtered integrated/scaled datasets. - PNG format - - clst_umap_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_clst_umap_res_*.pdf" - doc: | - Clustered UMAP projected PCA of filtered integrated/scaled datasets. - PDF format - - clst_umap_spl_by_cond_res_plot_png: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_clst_umap_spl_by_cond_res_*.png" - doc: | - Split by condition clustered UMAP projected PCA of filtered integrated/scaled datasets. - PNG format - - clst_umap_spl_by_cond_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_clst_umap_spl_by_cond_res_*.pdf" - doc: | - Split by condition clustered UMAP projected PCA of filtered integrated/scaled datasets. 
- PDF format - - clst_umap_ctype_res_plot_png: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_clst_umap_ctype_res_*.png" - doc: | - Grouped by predicted cell types UMAP projected PCA of filtered integrated/scaled datasets. - PNG format - - clst_umap_ctype_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_clst_umap_ctype_res_*.pdf" - doc: | - Grouped by predicted cell types UMAP projected PCA of filtered integrated/scaled datasets. - PDF format - - clst_umap_spl_by_ph_res_plot_png: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_clst_umap_spl_by_ph_res_*.png" - doc: | - Split by cell cycle phase clustered UMAP projected PCA of filtered integrated/scaled datasets. - PNG format - - clst_umap_spl_by_ph_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_clst_umap_spl_by_ph_res_*.pdf" - doc: | - Split by cell cycle phase clustered UMAP projected PCA of filtered integrated/scaled datasets. - PDF format - - clst_qc_mtrcs_res_plot_png: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_clst_qc_mtrcs_res_*.png" - doc: | - QC metrics for clustered UMAP projected PCA of filtered integrated/scaled datasets. - PNG format - - clst_qc_mtrcs_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_clst_qc_mtrcs_res_*.pdf" - doc: | - QC metrics for clustered UMAP projected PCA of filtered integrated/scaled datasets. - PDF format - - expr_avg_per_clst_res_plot_png: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_expr_avg_per_clst_res_*.png" - doc: | - Scaled average log normalized gene expression per cluster of filtered integrated/scaled datasets. 
- PNG format - - expr_avg_per_clst_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_expr_avg_per_clst_res_*.pdf" - doc: | - Scaled average log normalized gene expression per cluster of filtered integrated/scaled datasets. - PDF format - - expr_per_clst_cell_res_plot_png: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_expr_per_clst_cell_res_*.png" - doc: | - Log normalized gene expression per cell of clustered filtered integrated/scaled datasets. - PNG format - - expr_per_clst_cell_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_expr_per_clst_cell_res_*.pdf" - doc: | - Log normalized gene expression per cell of clustered filtered integrated/scaled datasets. - PDF format - - expr_clst_heatmap_res_plot_png: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_expr_clst_heatmap_res_*.png" - doc: | - Log normalized gene expression heatmap of clustered filtered integrated/scaled datasets. - PNG format - - expr_clst_heatmap_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_expr_clst_heatmap_res_*.pdf" - doc: | - Log normalized gene expression heatmap of clustered filtered integrated/scaled datasets. - PDF format - - expr_dnst_per_clst_res_plot_png: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_expr_dnst_per_clst_res_*.png" - doc: | - Log normalized gene expression densities per cluster of filtered integrated/scaled datasets. - PNG format - - expr_dnst_per_clst_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_expr_dnst_per_clst_res_*.pdf" - doc: | - Log normalized gene expression densities per cluster of filtered integrated/scaled datasets. 
- PDF format - - expr_avg_per_ctype_res_plot_png: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_expr_avg_per_ctype_res_*.png" - doc: | - Scaled average log normalized gene expression per predicted cell type of filtered integrated/scaled datasets. - PNG format - - expr_avg_per_ctype_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_expr_avg_per_ctype_res_*.pdf" - doc: | - Scaled average log normalized gene expression per predicted cell type of filtered integrated/scaled datasets. - PDF format - - expr_per_ctype_cell_res_plot_png: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_expr_per_ctype_cell_res_*.png" - doc: | - Log normalized gene expression per cell of clustered filtered integrated/scaled datasets with predicted cell types. - PNG format - - expr_per_ctype_cell_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_expr_per_ctype_cell_res_*.pdf" - doc: | - Log normalized gene expression per cell of clustered filtered integrated/scaled datasets with predicted cell types. - PDF format - - expr_ctype_heatmap_res_plot_png: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_expr_ctype_heatmap_res_*.png" - doc: | - Log normalized gene expression heatmap of clustered filtered integrated/scaled datasets with predicted cell types. - PNG format - - expr_ctype_heatmap_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_expr_ctype_heatmap_res_*.pdf" - doc: | - Log normalized gene expression heatmap of clustered filtered integrated/scaled datasets with predicted cell types. - PDF format - - expr_dnst_per_ctype_res_plot_png: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_expr_dnst_per_ctype_res_*.png" - doc: | - Log normalized gene expression densities per predicted cell type of filtered integrated/scaled datasets. 
- PNG format - - expr_dnst_per_ctype_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_expr_dnst_per_ctype_res_*.pdf" - doc: | - Log normalized gene expression densities per predicted cell type of filtered integrated/scaled datasets. - PDF format - - clst_pttv_gene_markers: - type: File - outputBinding: - glob: "*_clst_pttv_gene_markers.tsv" - doc: | - Putative gene markers file for all clusters and all resolutions. - TSV format - - clst_csrvd_gene_markers: - type: File - outputBinding: - glob: "*_clst_csrvd_gene_markers.tsv" - doc: | - Conserved gene markers file for all clusters and all resolutions. - TSV format - - seurat_clst_data_rds: - type: File? - outputBinding: - glob: "*_clst_data.rds" - doc: | - Clustered filtered integrated/scaled Seurat data. - RDS format - - cellbrowser_config_data: - type: Directory - outputBinding: - glob: "*_cellbrowser" - doc: | - Directory with UCSC Cellbrowser configuration data - - cellbrowser_html_data: - type: Directory - outputBinding: - glob: "*_cellbrowser/html_data" - doc: | - Directory with UCSC Cellbrowser formatted html data - - cellbrowser_html_file: - type: File - outputBinding: - glob: "*_cellbrowser/html_data/index.html" - doc: | - HTML index file from the directory with UCSC Cellbrowser formatted html data - - - stdout_log: - type: stdout - - stderr_log: - type: stderr - - -baseCommand: ["run_seurat.R"] -arguments: -- valueFrom: | - ${ - if (inputs.aggregation_metadata) { - return inputs.aggregation_metadata; - } else { - return runtime.outdir + "/dummy_metadata.csv" - } - } - prefix: "--identity" - - -stdout: seurat_cluster_stdout.log -stderr: seurat_cluster_stderr.log - - -$namespaces: - s: http://schema.org/ - -$schemas: -- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - - -label: "Seurat cluster" -s:name: "Seurat cluster" -s:alternateName: "Runs Seurat for comparative scRNA-seq analysis of across experimental conditions" 
- -s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/seurat-cluster.cwl -s:codeRepository: https://github.com/Barski-lab/workflows -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Cincinnati Children's Hospital Medical Center" - s:location: - - class: s:PostalAddress - s:addressCountry: "USA" - s:addressLocality: "Cincinnati" - s:addressRegion: "OH" - s:postalCode: "45229" - s:streetAddress: "3333 Burnet Ave" - s:telephone: "+1(513)636-4200" - s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" - s:department: - - class: s:Organization - s:legalName: "Allergy and Immunology" - s:department: - - class: s:Organization - s:legalName: "Barski Research Lab" - s:member: - - class: s:Person - s:name: Michael Kotliar - s:email: mailto:misha.kotliar@gmail.com - s:sameAs: - - id: http://orcid.org/0000-0002-6486-3898 - - -doc: | - Seurat cluster - ============== - - The joint analysis of multiple scRNA-Seq datasets with [Seurat](https://satijalab.org/seurat/) starts with evaluation of common - single-cell quality control (QC) metrics – genes and UMIs counts, percentage of mitochondrial genes - expressed. QC allows to get a general overview of the datasets quality as well as to define filtering - thresholds for dead or low-quality cells removal. Filtered merged datasets are then being processed - with the integration algorithm. Its main goal is to identify integration anchors – pairs of cells that can - “pull together” the same cell type populations from the different datasets. An integration algorithm - can also solve batch correction problem by regressing out the unwanted sources of variation. 
The - integrated data then undergo the dimensionality reduction processing that starts from the principal - component analysis (PCA). Based on the PCA results the uniform manifold approximation and - projection (UMAP) and clustering analysis are run with the principal components of the highest - variance. Clustered data are then used for gene markers identification. These genes are differentially - expressed between clusters and can be used for cell types assignment. - More details about scRNA-Seq integration analysis with Seurat can be found in the official - [documentation](https://satijalab.org/seurat/articles/integration_introduction.html). - - -s:about: | - usage: run_seurat.R [-h] --mex MEX [MEX ...] --identity - IDENTITY [--condition CONDITION] - [--classifier CLASSIFIER] - [--cellcycle CELLCYCLE] - [--barcodes BARCODES] [--mincells MINCELLS] - [--minfeatures [MINFEATURES [MINFEATURES ...]]] - [--maxfeatures [MAXFEATURES [MAXFEATURES ...]]] - [--minumi [MINUMI [MINUMI ...]]] - [--minnovelty [MINNOVELTY [MINNOVELTY ...]]] - [--maxmt MAXMT] [--mitopattern MITOPATTERN] - [--features [FEATURES [FEATURES ...]]] - [--regresscellcycle] [--regressmt] - [--highvarcount HIGHVARCOUNT] [--ndim NDIM] - [--resolution [RESOLUTION [RESOLUTION ...]]] - [--logfc LOGFC] [--minpct MINPCT] - [--onlypos] - [--testuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] - [--species {hs,mm,none}] [--pdf] [--rds] - [--output OUTPUT] [--threads THREADS] - - Runs Seurat for comparative scRNA-seq analysis of across experimental - conditions - - optional arguments: - -h, --help show this help message and exit - --mex MEX [MEX ...] Path to the folder with not normalized aggregated - feature-barcode matrix from Cell Ranger Aggregate in - MEX format. If multiple locations provided data is - assumed to be not aggregated (outputs from multiple - Cell Ranger Count runs) and will be merged. - --identity IDENTITY Path to the metadata TSV/CSV file to set the datasets - identities. 
If --mex points to the Cell Ranger - Aggregate outputs, the aggregation.csv file can be - used as well. If multiple locations were provided - through --mex, the file should include at least one - column - 'library_id', and be sorted based on the the - order of locations provided in --mex. - --condition CONDITION - Path to the TSV/CSV file to define datasets grouping. - First column - 'library_id' with the values provided - in the correspondent column of the --identity file, - second column 'condition'. Default: each dataset is - assigned to a separate group. - --classifier CLASSIFIER - Path to the Garnett classifier RDS file for cell type - prediction. Default: skip cell type prediction. - --cellcycle CELLCYCLE - Path to the TSV/CSV file with cell cycle data. First - column - 'phase', second column 'gene_id'. Default: - skip cell cycle score assignment. - --barcodes BARCODES Path to the headerless TSV/CSV file with the list of - barcodes to select cells of interest (one barcode per - line). Prefilters input feature-barcode matrix to - include only selected cells. Default: use all cells. - --mincells MINCELLS Include only features detected in at least this many - cells. Applied to aggregated feature-barcode matrix - from Cell Ranger Aggregate. Ignored when --mex points - to the locations of multiple Cell Ranger Count runs. - Default: 5 - --minfeatures [MINFEATURES [MINFEATURES ...]] - Include cells where at least this many features are - detected. If multiple values provided each of them - will be applied to the correspondent dataset from the - --mex input. Default: 250 (applied to all datasets) - --maxfeatures [MAXFEATURES [MAXFEATURES ...]] - Include cells with the number of features not bigger - than this value. If multiple values provided each of - them will be applied to the correspondent dataset from - the --mex input. 
Default: 5000 (applied to all - datasets) - --minumi [MINUMI [MINUMI ...]] - Include cells where at least this many UMIs - (transcripts) are detected. If multiple values - provided each of them will be applied to the - correspondent dataset from the --mex input. Default: - 500 (applied to all datasets) - --minnovelty [MINNOVELTY [MINNOVELTY ...]] - Include cells with the novelty score not lower than - this value, calculated as log10(genes)/log10(UMIs). If - multiple values provided each of them will be applied - to the correspondent dataset from the --mex input. - Default: 0.8 (applied to all datasets) - --maxmt MAXMT Include cells with the percentage of transcripts - mapped to mitochondrial genes not bigger than this - value. Default: 5 - --mitopattern MITOPATTERN - Regex pattern to identify mitochondrial genes. - Default: '^Mt-' - --features [FEATURES [FEATURES ...]] - Features of interest to evaluate expression. Default: - None - --regresscellcycle Regress cell cycle as a confounding source of - variation. Default: false - --regressmt Regress mitochondrial genes expression as a - confounding source of variation. Default: false - --highvarcount HIGHVARCOUNT - Number of highly variable features to detect. Used for - datasets integration, scaling, and dimensional - reduction. Default: 3000 - --ndim NDIM Number of principal components to use in UMAP - projection and clustering (from 1 to 50). Use Elbow - plot to adjust this parameter. Default: 10 - --resolution [RESOLUTION [RESOLUTION ...]] - Clustering resolution. Can be set as an array. - Default: 0.4 0.6 0.8 1.0 1.4 - --logfc LOGFC Include only those genes that on average have log fold - change difference in expression between every tested - pair of clusters not lower than this value. Default: - 0.25 - --minpct MINPCT Include only those features that are detected in not - lower than this fraction of cells in either of the two - tested clusters. 
Default: 0.1 - --onlypos Return only positive markers when running gene markers - identification. Default: false - --testuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2} - Statistical test to use for gene markers - identification. Default: wilcox - --species {hs,mm,none} - Select species for gene name conversion when running - cell type prediction with Garnett classifier. Default: - do not convert gene names - --pdf Export plots in PDF. Default: false - --rds Save Seurat data to RDS file. Default: false - --output OUTPUT Output prefix. Default: ./seurat - --threads THREADS Threads. Default: 1 From 717e1abee7180a9fd653d6a009167dbd999399e3 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Sat, 27 Jan 2024 19:21:18 -0500 Subject: [PATCH 114/162] Put back the original rename.cwl. No need to cause the update of so many workflows --- tools/rename.cwl | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tools/rename.cwl b/tools/rename.cwl index fe8397e7..52d10ae4 100644 --- a/tools/rename.cwl +++ b/tools/rename.cwl @@ -21,17 +21,15 @@ inputs: type: string? default: | #!/bin/bash - if [ -f $0 ]; then - cp $0 $1 - if [ -f $0.bai ]; then - cp $0.bai $1.bai - fi + cp $0 $1 + if [ -f $0.bai ]; then + cp $0.bai $1.bai fi inputBinding: position: 1 source_file: - type: File? + type: File inputBinding: position: 5 @@ -45,12 +43,12 @@ inputs: outputs: target_file: - type: File? 
+ type: File outputBinding: glob: $(get_target_name()) secondaryFiles: | ${ - if (inputs.source_file && inputs.source_file.secondaryFiles && inputs.source_file.secondaryFiles.length > 0){ + if (inputs.source_file.secondaryFiles && inputs.source_file.secondaryFiles.length > 0){ return inputs.target_filename+".bai"; } else { return "null"; From 7a5f153bf8e5827aff9bfe254943e6c41785e585 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Sat, 27 Jan 2024 19:23:38 -0500 Subject: [PATCH 115/162] Put back original bam-bedgraph-bigwig - causes to many workflows updates --- tools/bam-bedgraph-bigwig.cwl | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/bam-bedgraph-bigwig.cwl b/tools/bam-bedgraph-bigwig.cwl index 62fe3a1b..a0888bda 100644 --- a/tools/bam-bedgraph-bigwig.cwl +++ b/tools/bam-bedgraph-bigwig.cwl @@ -93,7 +93,6 @@ steps: pairchip: pairchip fragment_size: fragment_size scale: scale - chrom_length_file: chrom_length_file mapped_reads_number: mapped_reads_number strand: strand du: dutp From 4079877da7193e22898134326862e8ace5945c24 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Sat, 27 Jan 2024 19:48:30 -0500 Subject: [PATCH 116/162] Put back deprecated upstreams - not smart to delete them --- workflows/bedtools-multicov.cwl | 6 ++++++ workflows/deseq-lrt.cwl | 6 ++++++ workflows/deseq-multi-factor.cwl | 4 ++++ workflows/deseq.cwl | 12 ++++++++++++ workflows/diffbind-multi-factor.cwl | 2 ++ workflows/diffbind.cwl | 6 ++++++ workflows/feature-merge.cwl | 6 ++++++ workflows/filter-peaks-for-heatmap.cwl | 2 ++ workflows/genelists-deseq-diffbind.cwl | 8 ++++++++ workflows/heatmap.cwl | 2 ++ workflows/homer-motif-analysis-peak.cwl | 4 ++++ workflows/hopach.cwl | 6 ++++++ workflows/manorm-pe.cwl | 2 ++ workflows/manorm-se.cwl | 2 ++ workflows/pca.cwl | 6 ++++++ workflows/rgt-thor.cwl | 4 ++++ workflows/super-enhancer.cwl | 4 ++++ workflows/trim-chipseq-pe-cut-n-run.cwl | 1 + 18 files changed, 83 insertions(+) diff --git a/workflows/bedtools-multicov.cwl 
b/workflows/bedtools-multicov.cwl index 9e3aa742..8838c3f6 100644 --- a/workflows/bedtools-multicov.cwl +++ b/workflows/bedtools-multicov.cwl @@ -11,10 +11,16 @@ requirements: 'sd:upstream': sample: + - "chipseq-se.cwl" + - "chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-se.cwl" - "trim-atacseq-pe.cwl" + - "rnaseq-se.cwl" + - "rnaseq-pe.cwl" + - "rnaseq-se-dutp.cwl" + - "rnaseq-pe-dutp.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" - "trim-rnaseq-pe-dutp.cwl" diff --git a/workflows/deseq-lrt.cwl b/workflows/deseq-lrt.cwl index 58a74295..5311e347 100644 --- a/workflows/deseq-lrt.cwl +++ b/workflows/deseq-lrt.cwl @@ -11,6 +11,12 @@ requirements: 'sd:upstream': rnaseq_experiment: + - "rnaseq-se.cwl" + - "rnaseq-pe.cwl" + - "rnaseq-se-dutp.cwl" + - "rnaseq-pe-dutp.cwl" + - "rnaseq-se-dutp-mitochondrial.cwl" + - "rnaseq-pe-dutp-mitochondrial.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" - "trim-rnaseq-pe-dutp.cwl" diff --git a/workflows/deseq-multi-factor.cwl b/workflows/deseq-multi-factor.cwl index 01e6395a..dc952991 100644 --- a/workflows/deseq-multi-factor.cwl +++ b/workflows/deseq-multi-factor.cwl @@ -19,6 +19,10 @@ requirements: 'sd:upstream': rnaseq_experiment: + - "rnaseq-se.cwl" + - "rnaseq-pe.cwl" + - "rnaseq-se-dutp.cwl" + - "rnaseq-pe-dutp.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" - "trim-rnaseq-pe-dutp.cwl" diff --git a/workflows/deseq.cwl b/workflows/deseq.cwl index 2b6580c8..5af6a872 100644 --- a/workflows/deseq.cwl +++ b/workflows/deseq.cwl @@ -12,6 +12,12 @@ requirements: 'sd:upstream': rnaseq_cond_1: - "mirna-mirdeep2-se.cwl" + - "rnaseq-se.cwl" + - "rnaseq-pe.cwl" + - "rnaseq-se-dutp.cwl" + - "rnaseq-pe-dutp.cwl" + - "rnaseq-se-dutp-mitochondrial.cwl" + - "rnaseq-pe-dutp-mitochondrial.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" - "trim-rnaseq-pe-dutp.cwl" @@ -22,6 +28,12 @@ requirements: - "trim-rnaseq-pe-ercc.cwl" rnaseq_cond_2: - "mirna-mirdeep2-se.cwl" + - "rnaseq-se.cwl" + - "rnaseq-pe.cwl" 
+ - "rnaseq-se-dutp.cwl" + - "rnaseq-pe-dutp.cwl" + - "rnaseq-se-dutp-mitochondrial.cwl" + - "rnaseq-pe-dutp-mitochondrial.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" - "trim-rnaseq-pe-dutp.cwl" diff --git a/workflows/diffbind-multi-factor.cwl b/workflows/diffbind-multi-factor.cwl index a46ab920..5716b69d 100644 --- a/workflows/diffbind-multi-factor.cwl +++ b/workflows/diffbind-multi-factor.cwl @@ -18,6 +18,8 @@ requirements: 'sd:upstream': dna_experiment: + - "chipseq-se.cwl" + - "chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-se.cwl" diff --git a/workflows/diffbind.cwl b/workflows/diffbind.cwl index 0ff0a6b5..ded5bbcd 100644 --- a/workflows/diffbind.cwl +++ b/workflows/diffbind.cwl @@ -10,6 +10,8 @@ requirements: 'sd:upstream': first_biological_condition: + - "chipseq-se.cwl" + - "chipseq-pe.cwl" - "cutandrun-macs2-pe.cwl" - "cutandrun-seacr-pe.cwl" - "trim-chipseq-se.cwl" @@ -17,6 +19,8 @@ requirements: - "trim-atacseq-se.cwl" - "trim-atacseq-pe.cwl" second_biological_condition: + - "chipseq-se.cwl" + - "chipseq-pe.cwl" - "cutandrun-macs2-pe.cwl" - "cutandrun-seacr-pe.cwl" - "trim-chipseq-se.cwl" @@ -24,6 +28,8 @@ requirements: - "trim-atacseq-se.cwl" - "trim-atacseq-pe.cwl" blocked_condition: + - "chipseq-se.cwl" + - "chipseq-pe.cwl" - "cutandrun-macs2-pe.cwl" - "cutandrun-seacr-pe.cwl" - "trim-chipseq-se.cwl" diff --git a/workflows/feature-merge.cwl b/workflows/feature-merge.cwl index 631fe6de..999b49c9 100644 --- a/workflows/feature-merge.cwl +++ b/workflows/feature-merge.cwl @@ -11,6 +11,12 @@ requirements: 'sd:upstream': rnaseq_sample: + - "rnaseq-se.cwl" + - "rnaseq-pe.cwl" + - "rnaseq-se-dutp.cwl" + - "rnaseq-pe-dutp.cwl" + - "rnaseq-se-dutp-mitochondrial.cwl" + - "rnaseq-pe-dutp-mitochondrial.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" - "trim-rnaseq-pe-dutp.cwl" diff --git a/workflows/filter-peaks-for-heatmap.cwl b/workflows/filter-peaks-for-heatmap.cwl index 700f078d..6e4ead99 100644 --- 
a/workflows/filter-peaks-for-heatmap.cwl +++ b/workflows/filter-peaks-for-heatmap.cwl @@ -11,6 +11,8 @@ requirements: 'sd:upstream': sample_to_filter: + - "chipseq-se.cwl" + - "chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-se.cwl" diff --git a/workflows/genelists-deseq-diffbind.cwl b/workflows/genelists-deseq-diffbind.cwl index 5685b69c..8bdd20e6 100644 --- a/workflows/genelists-deseq-diffbind.cwl +++ b/workflows/genelists-deseq-diffbind.cwl @@ -16,6 +16,8 @@ requirements: - "filter-peaks-for-heatmap.cwl" - "genelists-sets.cwl" samples_nabinding: + - "chipseq-se.cwl" + - "chipseq-pe.cwl" - "cutandrun-macs2-pe.cwl" - "cutandrun-seacr-pe.cwl" - "trim-chipseq-se.cwl" @@ -23,6 +25,12 @@ requirements: - "trim-atacseq-se.cwl" - "trim-atacseq-pe.cwl" samples_rnaseq: + - "rnaseq-se.cwl" + - "rnaseq-pe.cwl" + - "rnaseq-se-dutp.cwl" + - "rnaseq-pe-dutp.cwl" + - "rnaseq-se-dutp-mitochondrial.cwl" + - "rnaseq-pe-dutp-mitochondrial.cwl" - "mirna-mirdeep2-se.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" diff --git a/workflows/heatmap.cwl b/workflows/heatmap.cwl index f0a15aa9..2a1b11d6 100644 --- a/workflows/heatmap.cwl +++ b/workflows/heatmap.cwl @@ -14,6 +14,8 @@ requirements: 'sd:upstream': chipseq_sample: + - "chipseq-se.cwl" + - "chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-se.cwl" diff --git a/workflows/homer-motif-analysis-peak.cwl b/workflows/homer-motif-analysis-peak.cwl index 1ff7bf61..01768a51 100644 --- a/workflows/homer-motif-analysis-peak.cwl +++ b/workflows/homer-motif-analysis-peak.cwl @@ -12,11 +12,15 @@ requirements: genome_indices: - "genome-indices.cwl" regions_a: + - "chipseq-se.cwl" + - "chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-se.cwl" - "trim-atacseq-pe.cwl" regions_b: + - "chipseq-se.cwl" + - "chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-se.cwl" diff --git a/workflows/hopach.cwl b/workflows/hopach.cwl index 
6456446c..db294ccd 100644 --- a/workflows/hopach.cwl +++ b/workflows/hopach.cwl @@ -11,6 +11,12 @@ requirements: 'sd:upstream': rnaseq_sample: + - "rnaseq-se.cwl" + - "rnaseq-pe.cwl" + - "rnaseq-se-dutp.cwl" + - "rnaseq-pe-dutp.cwl" + - "rnaseq-se-dutp-mitochondrial.cwl" + - "rnaseq-pe-dutp-mitochondrial.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" - "trim-rnaseq-pe-dutp.cwl" diff --git a/workflows/manorm-pe.cwl b/workflows/manorm-pe.cwl index 1b008f8b..9e968387 100644 --- a/workflows/manorm-pe.cwl +++ b/workflows/manorm-pe.cwl @@ -10,11 +10,13 @@ requirements: 'sd:upstream': first_chipseq_sample: + - "chipseq-pe.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-pe.cwl" - "cutandrun-macs2-pe.cwl" - "cutandrun-seacr-pe.cwl" second_chipseq_sample: + - "chipseq-pe.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-pe.cwl" - "cutandrun-macs2-pe.cwl" diff --git a/workflows/manorm-se.cwl b/workflows/manorm-se.cwl index 9cbc0336..79c7bf56 100644 --- a/workflows/manorm-se.cwl +++ b/workflows/manorm-se.cwl @@ -10,9 +10,11 @@ requirements: 'sd:upstream': first_chipseq_sample: + - "chipseq-se.cwl" - "trim-chipseq-se.cwl" - "trim-atacseq-se.cwl" second_chipseq_sample: + - "chipseq-se.cwl" - "trim-chipseq-se.cwl" - "trim-atacseq-se.cwl" diff --git a/workflows/pca.cwl b/workflows/pca.cwl index f9d46029..006c1372 100644 --- a/workflows/pca.cwl +++ b/workflows/pca.cwl @@ -11,6 +11,12 @@ requirements: 'sd:upstream': rnaseq_sample: + - "rnaseq-se.cwl" + - "rnaseq-pe.cwl" + - "rnaseq-se-dutp.cwl" + - "rnaseq-pe-dutp.cwl" + - "rnaseq-se-dutp-mitochondrial.cwl" + - "rnaseq-pe-dutp-mitochondrial.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" - "trim-rnaseq-pe-dutp.cwl" diff --git a/workflows/rgt-thor.cwl b/workflows/rgt-thor.cwl index 7bcf6f04..04f607ec 100644 --- a/workflows/rgt-thor.cwl +++ b/workflows/rgt-thor.cwl @@ -10,11 +10,15 @@ requirements: 'sd:upstream': first_biological_condition: + - "chipseq-se.cwl" + - "chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "trim-chipseq-pe.cwl" - 
"trim-atacseq-se.cwl" - "trim-atacseq-pe.cwl" second_biological_condition: + - "chipseq-se.cwl" + - "chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "trim-chipseq-pe.cwl" - "trim-atacseq-se.cwl" diff --git a/workflows/super-enhancer.cwl b/workflows/super-enhancer.cwl index 16c3f4d1..07dbadf1 100644 --- a/workflows/super-enhancer.cwl +++ b/workflows/super-enhancer.cwl @@ -10,10 +10,14 @@ requirements: 'sd:upstream': chipseq_sample: + - "chipseq-se.cwl" + - "chipseq-pe.cwl" - "trim-chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "cutandrun-pe.cwl" chipseq_control: + - "chipseq-se.cwl" + - "chipseq-pe.cwl" - "trim-chipseq-pe.cwl" - "trim-chipseq-se.cwl" - "cutandrun-pe.cwl" diff --git a/workflows/trim-chipseq-pe-cut-n-run.cwl b/workflows/trim-chipseq-pe-cut-n-run.cwl index a80fdac6..04908f44 100644 --- a/workflows/trim-chipseq-pe-cut-n-run.cwl +++ b/workflows/trim-chipseq-pe-cut-n-run.cwl @@ -380,6 +380,7 @@ s:creator: doc: | Experimental pipeline for Cut-n-Run analysis. Uses mapping results from the following experiment types: + - `chipseq-pe.cwl` - `trim-chipseq-pe.cwl` - `trim-atacseq-pe.cwl` From df947f0d2545dcf859f4a1730764ff464825f943 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Sat, 27 Jan 2024 19:51:12 -0500 Subject: [PATCH 117/162] Not important changes --- workflows/genelists-deseq-diffbind.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/genelists-deseq-diffbind.cwl b/workflows/genelists-deseq-diffbind.cwl index 8bdd20e6..bdd1c0f0 100644 --- a/workflows/genelists-deseq-diffbind.cwl +++ b/workflows/genelists-deseq-diffbind.cwl @@ -25,13 +25,13 @@ requirements: - "trim-atacseq-se.cwl" - "trim-atacseq-pe.cwl" samples_rnaseq: + - "mirna-mirdeep2-se.cwl" - "rnaseq-se.cwl" - "rnaseq-pe.cwl" - "rnaseq-se-dutp.cwl" - "rnaseq-pe-dutp.cwl" - "rnaseq-se-dutp-mitochondrial.cwl" - "rnaseq-pe-dutp-mitochondrial.cwl" - - "mirna-mirdeep2-se.cwl" - "trim-rnaseq-pe.cwl" - "trim-rnaseq-se.cwl" - "trim-rnaseq-pe-dutp.cwl" From 
543f28f01537b7731cc801a213df63498d88c12c Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 31 Jan 2024 11:10:04 -0500 Subject: [PATCH 118/162] Fix bug in umi-tools-dedup --- tools/umi-tools-dedup.cwl | 40 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/tools/umi-tools-dedup.cwl b/tools/umi-tools-dedup.cwl index c54cc4c6..6dbdc52f 100644 --- a/tools/umi-tools-dedup.cwl +++ b/tools/umi-tools-dedup.cwl @@ -11,7 +11,6 @@ requirements: return inputs.output_filename?inputs.output_filename:root+"_dedup."+ext; }; - hints: - class: DockerRequirement dockerPull: quay.io/biocontainers/umi_tools:1.0.1--py38h0213d0e_2 @@ -27,6 +26,8 @@ inputs: umi_tools dedup --random-seed=12345 "${@:1}" else echo "Skip umi_tools dedup " ${@:1} + cp $2 $4 + cp $2.bai $4.bai fi inputBinding: position: 5 @@ -51,25 +52,25 @@ inputs: prefix: "-I" doc: "Input BAM file" + output_filename: + type: string? + inputBinding: + position: 8 + prefix: "-S" + valueFrom: $(default_output_filename()) + default: "" + doc: "Output filename" + paired_end: type: boolean? inputBinding: - position: 8 + position: 9 prefix: "--paired" doc: | Inputs BAM file is paired end - output both read pairs. This will also force the use of the template length to determine reads with the same mapping coordinates. - output_filename: - type: string? - inputBinding: - position: 9 - prefix: "-S" - valueFrom: $(default_output_filename()) - default: "" - doc: "Output filename" - output_stats: type: string? 
inputBinding: @@ -97,16 +98,9 @@ outputs: dedup_bam_file: type: File outputBinding: - glob: | - ${ return inputs.trigger?default_output_filename():inputs.bam_file.basename } - secondaryFiles: | - ${ - if (inputs.bam_file.secondaryFiles && inputs.trigger == false){ - return inputs.bam_file.secondaryFiles; - } else { - return "null"; - } - } + glob: $(default_output_filename()) + secondaryFiles: + - .bai output_stats: type: @@ -123,8 +117,8 @@ outputs: baseCommand: [bash, '-c'] -stdout: umi_tools_dedup_stdout_file.log -stderr: umi_tools_dedup_stderr_file.log +stdout: umi_tools_dedup_stdout.log +stderr: umi_tools_dedup_stderr.log $namespaces: From a35c24262bb4518b2e7dfa5f4f30c75aa30ab64e Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 31 Jan 2024 11:12:00 -0500 Subject: [PATCH 119/162] Add missing space --- tools/umi-tools-dedup.cwl | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/umi-tools-dedup.cwl b/tools/umi-tools-dedup.cwl index 6dbdc52f..726e1da0 100644 --- a/tools/umi-tools-dedup.cwl +++ b/tools/umi-tools-dedup.cwl @@ -11,6 +11,7 @@ requirements: return inputs.output_filename?inputs.output_filename:root+"_dedup."+ext; }; + hints: - class: DockerRequirement dockerPull: quay.io/biocontainers/umi_tools:1.0.1--py38h0213d0e_2 From 4d03bd705492aee61fcb3641e5da46543cad3c42 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 31 Jan 2024 11:12:55 -0500 Subject: [PATCH 120/162] Not important changes --- tools/umi-tools-dedup.cwl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/umi-tools-dedup.cwl b/tools/umi-tools-dedup.cwl index 726e1da0..9b1d3629 100644 --- a/tools/umi-tools-dedup.cwl +++ b/tools/umi-tools-dedup.cwl @@ -118,8 +118,8 @@ outputs: baseCommand: [bash, '-c'] -stdout: umi_tools_dedup_stdout.log -stderr: umi_tools_dedup_stderr.log +stdout: umi_tools_dedup_stdout_file.log +stderr: umi_tools_dedup_stdout_file.log $namespaces: From c1ad424195ec3de0c61228e7066bcc9f637283ed Mon Sep 17 00:00:00 2001 From: Michael 
Kotliar Date: Wed, 31 Jan 2024 11:13:33 -0500 Subject: [PATCH 121/162] Typo --- tools/umi-tools-dedup.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/umi-tools-dedup.cwl b/tools/umi-tools-dedup.cwl index 9b1d3629..3cce5b96 100644 --- a/tools/umi-tools-dedup.cwl +++ b/tools/umi-tools-dedup.cwl @@ -119,7 +119,7 @@ outputs: baseCommand: [bash, '-c'] stdout: umi_tools_dedup_stdout_file.log -stderr: umi_tools_dedup_stdout_file.log +stderr: umi_tools_dedup_stderr_file.log $namespaces: From 06ccad7cf81ff20d07645576294a8bfc4c24bd99 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 11 Mar 2024 23:00:50 -0400 Subject: [PATCH 122/162] Update sc tools to use v0.0.34 docker image - Use intrinsicDimension to estimate dimensionality in sc_rna_reduce.R and sc_rna_cluster.R scripts if --dimensions parameter was set to 0 - Allow to show density levels on UMAP (see dim_plot function from graphics.R) - Show cell counts on the composition plots (can disabled when label parameter is set to FALSE) - Allow to set seed in all scripts - Automatically estimate --minumis, --mingenes, --maxgenes, --maxmt, and --minfragments filtering thresholds if set to 0 (related to sc_rna_filter.R and sc_multiome_filter.R scripts) - Updated plots for cell counts comparison between datasets, groups, clusters, etc in sc_[rna/atac/wnn]_cluster.R and sc_ctype_assign.R scripts - Add optional --reduction to sc_ctype_assign.R scripts to outputs plots only to selected reduction (if not provided, it will be defined based on the --source parameter) - Added "RNA reads vs mitochondrial % per cell" plot in sc_rna_filter.R and sc_multiome_filter.R scripts - Set the default normalization method in sc_rna_reduce.R to sctglm --- tools/sc-atac-cluster.cwl | 207 +++--- tools/sc-atac-coverage.cwl | 47 +- tools/sc-atac-dbinding.cwl | 85 ++- tools/sc-atac-reduce.cwl | 117 +-- tools/sc-ctype-assign.cwl | 684 ++++++++---------- tools/sc-multiome-filter.cwl | 183 +++-- tools/sc-rna-cluster.cwl | 
451 +++++++----- tools/sc-rna-da-cells.cwl | 46 +- tools/sc-rna-de-pseudobulk.cwl | 63 +- tools/sc-rna-filter.cwl | 149 ++-- tools/sc-rna-reduce.cwl | 58 +- tools/sc-rna-trajectory.cwl | 49 +- tools/sc-triangulate.cwl | 69 +- tools/sc-vdj-profile.cwl | 74 +- tools/sc-wnn-cluster.cwl | 490 ++++++++----- workflows/cellranger-aggr.cwl | 36 +- workflows/cellranger-arc-aggr.cwl | 32 +- workflows/cellranger-arc-count.cwl | 50 +- workflows/cellranger-atac-aggr.cwl | 32 +- workflows/cellranger-atac-count.cwl | 44 +- workflows/cellranger-mkref.cwl | 15 +- workflows/cellranger-mkvdjref.cwl | 2 +- workflows/cellranger-multi.cwl | 52 +- workflows/cellranger-reanalyze.cwl | 74 +- workflows/fastq-download.cwl | 10 +- workflows/sc-atac-cluster.cwl | 218 +++--- workflows/sc-atac-coverage.cwl | 64 +- workflows/sc-atac-dbinding.cwl | 202 +++--- workflows/sc-atac-reduce.cwl | 141 ++-- workflows/sc-ctype-assign.cwl | 588 +++++++-------- workflows/sc-multiome-filter.cwl | 93 ++- workflows/sc-rna-cluster.cwl | 432 ++++++----- workflows/sc-rna-da-cells.cwl | 78 +- workflows/sc-rna-de-pseudobulk.cwl | 79 +- workflows/sc-rna-filter.cwl | 81 ++- workflows/sc-rna-reduce.cwl | 146 ++-- workflows/sc-rna-trajectory.cwl | 171 +++-- workflows/sc-triangulate.cwl | 104 +-- workflows/sc-vdj-profile.cwl | 173 ++--- workflows/sc-wnn-cluster.cwl | 455 +++++++----- .../single-cell-preprocess-cellranger.cwl | 4 +- 41 files changed, 3391 insertions(+), 2757 deletions(-) diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index 17ad942d..18d96e62 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.33 + dockerPull: biowardrobe2/sc-tools:v0.0.34 inputs: @@ -220,7 +220,7 @@ inputs: inputBinding: prefix: "--h5ad" doc: | - Save Seurat data to h5ad file. + Save raw counts from the ATAC assay to h5ad file. 
Default: false export_ucsc_cb: @@ -262,76 +262,90 @@ inputs: Number of cores/cpus to use. Default: 1 + seed: + type: int? + inputBinding: + prefix: "--seed" + doc: | + Seed number for random values. + Default: 42 + outputs: - umap_res_plot_png: + umap_gr_clst_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_res_*.png" + glob: "*_umap_gr_clst_res_*.png" doc: | - UMAP, colored by cluster. - PNG format + UMAP colored by cluster. + All cells; all resolutions. + PNG format. - umap_res_plot_pdf: + umap_gr_clst_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_res_*.pdf" + glob: "*_umap_gr_clst_res_*.pdf" doc: | - UMAP, colored by cluster. - PDF format + UMAP colored by cluster. + All cells; all resolutions. + PDF format. - slh_res_plot_png: + slh_gr_clst_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_slh_res_*.png" + glob: "*_slh_gr_clst_res_*.png" doc: | Silhouette scores. - PNG format + All cells; all resolutions. + PNG format. - slh_res_plot_pdf: + slh_gr_clst_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_slh_res_*.pdf" + glob: "*_slh_gr_clst_res_*.pdf" doc: | Silhouette scores. - PDF format + All cells; all resolutions. + PDF format. - umap_spl_idnt_res_plot_png: + umap_gr_clst_spl_idnt_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_spl_idnt_res_*.png" + glob: "*_umap_gr_clst_spl_idnt_res_*.png" doc: | - UMAP, colored by cluster, - split by dataset. - PNG format + UMAP colored by cluster. + Split by dataset; downsampled to the + smallest dataset; all resolutions. + PNG format. - umap_spl_idnt_res_plot_pdf: + umap_gr_clst_spl_idnt_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_spl_idnt_res_*.pdf" + glob: "*_umap_gr_clst_spl_idnt_res_*.pdf" doc: | - UMAP, colored by cluster, - split by dataset. - PDF format + UMAP colored by cluster. 
+ Split by dataset; downsampled to the + smallest dataset; all resolutions. + PDF format. cmp_gr_clst_spl_idnt_res_plot_png: type: @@ -341,10 +355,10 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_idnt_res_*.png" doc: | - Composition plot, colored by - cluster, split by dataset, - downsampled. - PNG format + Composition plot colored by cluster. + Split by dataset; downsampled to the + smallest dataset; all resolutions. + PNG format. cmp_gr_clst_spl_idnt_res_plot_pdf: type: @@ -354,10 +368,10 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_idnt_res_*.pdf" doc: | - Composition plot, colored by - cluster, split by dataset, - downsampled. - PDF format + Composition plot colored by cluster. + Split by dataset; downsampled to the + smallest dataset; all resolutions. + PDF format. cmp_gr_idnt_spl_clst_res_plot_png: type: @@ -367,10 +381,10 @@ outputs: outputBinding: glob: "*_cmp_gr_idnt_spl_clst_res_*.png" doc: | - Composition plot, colored by - dataset, split by cluster, - downsampled. - PNG format + Composition plot colored by dataset. + Split by cluster; downsampled to the + smallest dataset; all resolutions. + PNG format. cmp_gr_idnt_spl_clst_res_plot_pdf: type: @@ -380,34 +394,38 @@ outputs: outputBinding: glob: "*_cmp_gr_idnt_spl_clst_res_*.pdf" doc: | - Composition plot, colored by - dataset, split by cluster, - downsampled. - PDF format + Composition plot colored by dataset. + Split by cluster; downsampled to the + smallest dataset; all resolutions. + PDF format. - umap_spl_cnd_res_plot_png: + umap_gr_clst_spl_cnd_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_spl_cnd_res_*.png" + glob: "*_umap_gr_clst_spl_cnd_res_*.png" doc: | - UMAP, colored by cluster, split - by grouping condition. - PNG format + UMAP colored by cluster. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group; all resolutions. + PNG format. 
- umap_spl_cnd_res_plot_pdf: + umap_gr_clst_spl_cnd_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_spl_cnd_res_*.pdf" + glob: "*_umap_gr_clst_spl_cnd_res_*.pdf" doc: | - UMAP, colored by cluster, split - by grouping condition. - PDF format + UMAP colored by cluster. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group; all resolutions. + PDF format. cmp_gr_clst_spl_cnd_res_plot_png: type: @@ -417,10 +435,11 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_cnd_res_*.png" doc: | - Composition plot, colored by - cluster, split by grouping - condition, downsampled. - PNG format + Composition plot colored by cluster. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group; all resolutions. + PNG format. cmp_gr_clst_spl_cnd_res_plot_pdf: type: @@ -430,10 +449,11 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_cnd_res_*.pdf" doc: | - Composition plot, colored by - cluster, split by grouping - condition, downsampled. - PDF format + Composition plot colored by cluster. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group; all resolutions. + PDF format. cmp_gr_cnd_spl_clst_res_plot_png: type: @@ -443,10 +463,11 @@ outputs: outputBinding: glob: "*_cmp_gr_cnd_spl_clst_res_*.png" doc: | - Composition plot, colored by - grouping condition, split by - cluster, downsampled. - PNG format + Composition plot colored by grouping condition. + Split by cluster; first downsampled to the + smallest dataset, then downsampled to the + smallest group; all resolutions. + PNG format. cmp_gr_cnd_spl_clst_res_plot_pdf: type: @@ -456,10 +477,11 @@ outputs: outputBinding: glob: "*_cmp_gr_cnd_spl_clst_res_*.pdf" doc: | - Composition plot, colored by - grouping condition, split by - cluster, downsampled. - PDF format + Composition plot colored by grouping condition. 
+ Split by cluster; first downsampled to the + smallest dataset, then downsampled to the + smallest group; all resolutions. + PDF format. cvrg_res_plot_png: type: @@ -469,8 +491,9 @@ outputs: outputBinding: glob: "*_cvrg_res_*.png" doc: | - ATAC fragments coverage. - PNG format + ATAC fragment coverage. + All genes of interest; all resolutions. + PNG format. cvrg_res_plot_pdf: type: @@ -480,58 +503,63 @@ outputs: outputBinding: glob: "*_cvrg_res_*.pdf" doc: | - ATAC fragments coverage. - PDF format + ATAC fragment coverage. + All genes of interest; all resolutions. + PDF format. peak_markers_tsv: type: File? outputBinding: glob: "*_peak_markers.tsv" doc: | - Peak markers per cluster for all resolutions. - TSV format + Peak markers. + All resolutions. + TSV format. ucsc_cb_config_data: type: Directory? outputBinding: glob: "*_cellbrowser" doc: | - Directory with UCSC Cellbrowser configuration data. + UCSC Cell Browser configuration data. ucsc_cb_html_data: type: Directory? outputBinding: glob: "*_cellbrowser/html_data" doc: | - Directory with UCSC Cellbrowser html data. + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? outputBinding: glob: "*_cellbrowser/html_data/index.html" doc: | - HTML index file from the directory with UCSC Cellbrowser html data. + UCSC Cell Browser html index. seurat_data_rds: type: File outputBinding: glob: "*_data.rds" doc: | - Reduced Seurat data in RDS format + Seurat object. + RDS format. seurat_data_h5seurat: type: File? outputBinding: glob: "*_data.h5seurat" doc: | - Reduced Seurat data in h5seurat format + Seurat object. + h5Seurat format. seurat_data_h5ad: type: File? outputBinding: - glob: "*_data.h5ad" + glob: "*_counts.h5ad" doc: | - Reduced Seurat data in h5ad format + Seurat object. + H5AD format. 
stdout_log: type: stdout @@ -553,8 +581,8 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell ATAC-Seq Cluster Analysis" -s:name: "Single-cell ATAC-Seq Cluster Analysis" +label: "Single-Cell ATAC-Seq Cluster Analysis" +s:name: "Single-Cell ATAC-Seq Cluster Analysis" s:alternateName: "Clusters single-cell ATAC-Seq datasets, identifies differentially accessible peaks" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-atac-cluster.cwl @@ -593,14 +621,14 @@ s:creator: doc: | - Single-cell ATAC-Seq Cluster Analysis + Single-Cell ATAC-Seq Cluster Analysis Clusters single-cell ATAC-Seq datasets, identifies differentially accessible peaks. s:about: | - usage: sc_atac_cluster.R [-h] --query QUERY + usage: /usr/local/bin/sc_atac_cluster.R [-h] --query QUERY [--dimensions DIMENSIONS] [--ametric {euclidean,cosine,manhattan,hamming}] [--algorithm {louvain,mult-louvain,slm,leiden}] @@ -616,8 +644,9 @@ s:about: | [--h5ad] [--cbbuild] [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] + [--seed SEED] - Single-cell ATAC-Seq Cluster Analysis + Single-Cell ATAC-Seq Cluster Analysis optional arguments: -h, --help show this help message and exit @@ -683,11 +712,13 @@ s:about: | --pdf Export plots in PDF. Default: false --verbose Print debug information. Default: false --h5seurat Save Seurat data to h5seurat file. Default: false - --h5ad Save Seurat data to h5ad file. Default: false + --h5ad Save raw counts from the ATAC assay to h5ad file. + Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --output OUTPUT Output prefix. Default: ./sc --theme {gray,bw,linedraw,light,dark,minimal,classic,void} Color theme for all generated plots. Default: classic --cpus CPUS Number of cores/cpus to use. 
Default: 1 --memory MEMORY Maximum memory in GB allowed to be shared between the - workers when using multiple --cpus. Default: 32 \ No newline at end of file + workers when using multiple --cpus. Default: 32 + --seed SEED Seed number for random values. Default: 42 \ No newline at end of file diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index 79d61bda..5f97216b 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.33 + dockerPull: biowardrobe2/sc-tools:v0.0.34 inputs: @@ -120,6 +120,14 @@ inputs: Number of cores/cpus to use. Default: 1 + seed: + type: int? + inputBinding: + prefix: "--seed" + doc: | + Seed number for random values. + Default: 42 + outputs: @@ -129,7 +137,7 @@ outputs: glob: "*_peaks.bigBed" doc: | Locations of open-chromatin regions ("peaks") - in bigBed format + in bigBed format. cut_sites_bigwig_file: type: @@ -140,7 +148,7 @@ outputs: glob: "*_cut_cov.bigWig" doc: | Genome coverage calculated for Tn5 cut sites - in bigWig format + in bigWig format. fragments_bigwig_file: type: @@ -151,7 +159,7 @@ outputs: glob: "*_frg_cov.bigWig" doc: | Genome coverage calculated for ATAC fragments - in bigWig format + in bigWig format. 
stdout_log: type: stdout @@ -173,8 +181,8 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell ATAC-Seq Genome Coverage" -s:name: "Single-cell ATAC-Seq Genome Coverage" +label: "Single-Cell ATAC-Seq Genome Coverage" +s:name: "Single-Cell ATAC-Seq Genome Coverage" s:alternateName: "Creates genome coverage bigWig files from the provided ATAC fragments file and selected grouping parameters" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-atac-coverage.cwl @@ -213,7 +221,7 @@ s:creator: doc: | - Single-cell ATAC-Seq Genome Coverage + Single-Cell ATAC-Seq Genome Coverage Creates genome coverage bigWig files from the provided ATAC fragments file and selected grouping parameters. @@ -222,14 +230,18 @@ doc: | s:about: | - usage: sc_atac_coverage.R - [-h] --query QUERY --fragments FRAGMENTS [--splitby [SPLITBY ...]] - [--metadata METADATA] [--barcodes BARCODES] [--flank FLANK] [--verbose] - [--tmpdir TMPDIR] [--output OUTPUT] [--cpus CPUS] [--memory MEMORY] - - Single-cell ATAC-Seq Genome Coverage - - options: + usage: /usr/local/bin/sc_atac_coverage.R [-h] --query QUERY --fragments + FRAGMENTS + [--splitby [SPLITBY [SPLITBY ...]]] + [--metadata METADATA] + [--barcodes BARCODES] [--flank FLANK] + [--verbose] [--tmpdir TMPDIR] + [--output OUTPUT] [--cpus CPUS] + [--memory MEMORY] [--seed SEED] + + Single-Cell ATAC-Seq Genome Coverage + + optional arguments: -h, --help show this help message and exit --query QUERY Path to the RDS file to load Seurat object from. This file should include chromatin accessibility @@ -239,7 +251,7 @@ s:about: | Count and barcode information for every ATAC fragment used in the loaded Seurat object. File should be saved in TSV format and to be tbi-indexed. - --splitby [SPLITBY ...] + --splitby [SPLITBY [SPLITBY ...]] Column from the Seurat object metadata to split cells into groups. 
May be one of the columns added with --metadata or --barcodes parameters. Default: split by @@ -271,4 +283,5 @@ s:about: | --output OUTPUT Output prefix. Default: ./sc --cpus CPUS Number of cores/cpus to use. Default: 1 --memory MEMORY Maximum memory in GB allowed to be shared between the - workers when using multiple --cpus. Default: 32 \ No newline at end of file + workers when using multiple --cpus. Default: 32 + --seed SEED Seed number for random values. Default: 42 \ No newline at end of file diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index d848a15a..bc73f8ff 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.33 + dockerPull: biowardrobe2/sc-tools:v0.0.34 inputs: @@ -211,7 +211,7 @@ inputs: prefix: "--padj" doc: | In the exploratory visualization part of the analysis - output only differentially bound peaks with adjusted + output only differentially accessible regions with adjusted P-value not bigger than this value. Default: 0.05 minimum_logfc: @@ -220,7 +220,7 @@ inputs: prefix: "--logfc" doc: | In the exploratory visualization part of the analysis - output only differentially bound peaks with log2 Fold + output only differentially accessible regions with log2 Fold Change not smaller than this value. Default: 1.0 export_pdf_plots: @@ -291,6 +291,14 @@ inputs: Number of cores/cpus to use. Default: 1 + seed: + type: int? + inputBinding: + prefix: "--seed" + doc: | + Seed number for random values. + Default: 42 + outputs: @@ -302,7 +310,7 @@ outputs: Cells UMAP split by selected criteria, optionally subsetted to the specific group (rnaumap dim. reduction). - PNG format + PNG format. umap_rd_rnaumap_plot_pdf: type: File? @@ -312,7 +320,7 @@ outputs: Cells UMAP split by selected criteria, optionally subsetted to the specific group (rnaumap dim. reduction). - PDF format + PDF format. 
umap_rd_atacumap_plot_png: type: File? @@ -322,7 +330,7 @@ outputs: Cells UMAP split by selected criteria, optionally subsetted to the specific group (atacumap dim. reduction). - PNG format + PNG format. umap_rd_atacumap_plot_pdf: type: File? @@ -332,7 +340,7 @@ outputs: Cells UMAP split by selected criteria, optionally subsetted to the specific group (atacumap dim. reduction). - PDF format + PDF format. umap_rd_wnnumap_plot_png: type: File? @@ -342,7 +350,7 @@ outputs: Cells UMAP split by selected criteria, optionally subsetted to the specific group (wnnumap dim. reduction). - PNG format + PNG format. umap_rd_wnnumap_plot_pdf: type: File? @@ -352,7 +360,7 @@ outputs: Cells UMAP split by selected criteria, optionally subsetted to the specific group (wnnumap dim. reduction). - PDF format + PDF format. seurat_peaks_bigbed_file: type: File? @@ -468,24 +476,24 @@ outputs: outputBinding: glob: "*_db_sites.tsv" doc: | - Not filtered differentially bound sites - in TSV format + Not filtered differentially accessible regions. + TSV format. dbnd_vlcn_plot_png: type: File? outputBinding: glob: "*_dbnd_vlcn.png" doc: | - Volcano plot of differentially bound sites. - PNG format + Volcano plot of differentially accessible regions. + PNG format. dbnd_vlcn_plot_pdf: type: File? outputBinding: glob: "*_dbnd_vlcn.pdf" doc: | - Volcano plot of differentially bound sites. - PDF format + Volcano plot of differentially accessible regions. + PDF format. first_enrch_bigbed_file: type: File? 
@@ -547,9 +555,9 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell ATAC-Seq Differential Binding Analysis" -s:name: "Single-cell ATAC-Seq Differential Binding Analysis" -s:alternateName: "Identifies differential bound sites between two groups of cells" +label: "Single-Cell ATAC-Seq Differential Accessibility Analysis" +s:name: "Single-Cell ATAC-Seq Differential Accessibility Analysis" +s:alternateName: "Identifies differentially accessible regions between two groups of cells" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-atac-dbinding.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -587,27 +595,33 @@ s:creator: doc: | - Single-cell ATAC-Seq Differential Binding Analysis + Single-Cell ATAC-Seq Differential Accessibility Analysis - Identifies differential bound sites between two groups of cells + Identifies differentially accessible regions between two groups of cells --tmpdir parameter is not exposed as input. 
s:about: | - usage: sc_atac_dbinding.R [-h] --query QUERY --fragments FRAGMENTS - [--metadata METADATA] [--barcodes BARCODES] - [--groupby GROUPBY] [--subset [SUBSET [SUBSET ...]]] - --splitby SPLITBY --first FIRST --second SECOND - [--test {negative-binomial,poisson,logistic-regression,mast,manorm2}] - [--genome {hs,mm}] [--qvalue QVALUE] - [--minpeakgap MINPEAKGAP] [--binsize BINSIZE] - [--maxpeaks MAXPEAKS] [--blacklist BLACKLIST] - [--padj PADJ] [--logfc LOGFC] [--pdf] [--verbose] - [--tmpdir TMPDIR] [--output OUTPUT] - [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] - [--cpus CPUS] [--memory MEMORY] - - Single-cell ATAC-Seq Differential Binding Analysis + usage: /usr/local/bin/sc_atac_dbinding.R [-h] --query QUERY --fragments + FRAGMENTS [--metadata METADATA] + [--barcodes BARCODES] + [--groupby GROUPBY] + [--subset [SUBSET [SUBSET ...]]] + --splitby SPLITBY --first FIRST + --second SECOND + [--test {negative-binomial,poisson,logistic-regression,mast,manorm2}] + [--genome {hs,mm}] [--qvalue QVALUE] + [--minpeakgap MINPEAKGAP] + [--binsize BINSIZE] + [--maxpeaks MAXPEAKS] + [--blacklist BLACKLIST] [--padj PADJ] + [--logfc LOGFC] [--pdf] [--verbose] + [--tmpdir TMPDIR] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + [--seed SEED] + + Single-Cell ATAC-Seq Differential Accessibility Analysis optional arguments: -h, --help show this help message and exit @@ -707,4 +721,5 @@ s:about: | Color theme for all generated plots. Default: classic --cpus CPUS Number of cores/cpus to use. Default: 1 --memory MEMORY Maximum memory in GB allowed to be shared between the - workers when using multiple --cpus. Default: 32 \ No newline at end of file + workers when using multiple --cpus. Default: 32 + --seed SEED Seed number for random values. 
Default: 42 \ No newline at end of file diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index f8f7bc81..8a5f3f25 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.33 + dockerPull: biowardrobe2/sc-tools:v0.0.34 inputs: @@ -251,7 +251,7 @@ inputs: inputBinding: prefix: "--h5ad" doc: | - Save Seurat data to h5ad file. + Save raw counts from the ATAC assay to h5ad file. Default: false export_ucsc_cb: @@ -293,6 +293,14 @@ inputs: Number of cores/cpus to use. Default: 1 + seed: + type: int? + inputBinding: + prefix: "--seed" + doc: | + Seed number for random values. + Default: 42 + outputs: @@ -302,7 +310,7 @@ outputs: glob: "*_qc_dim_corr.png" doc: | Correlation between QC metrics and LSI components. - PNG format + PNG format. qc_dim_corr_plot_pdf: type: File? @@ -310,7 +318,7 @@ outputs: glob: "*_qc_dim_corr.pdf" doc: | Correlation between QC metrics and LSI components. - PDF format + PDF format. umap_qc_mtrcs_plot_png: type: File? @@ -318,7 +326,7 @@ outputs: glob: "*_umap_qc_mtrcs.png" doc: | UMAP, QC metrics. - PNG format + PNG format. umap_qc_mtrcs_plot_pdf: type: File? @@ -326,7 +334,7 @@ outputs: glob: "*_umap_qc_mtrcs.pdf" doc: | UMAP, QC metrics. - PDF format + PDF format. umap_plot_png: type: File? @@ -334,7 +342,7 @@ outputs: glob: "*_umap.png" doc: | UMAP, colored by dataset. - PNG format + PNG format. umap_plot_pdf: type: File? @@ -342,7 +350,7 @@ outputs: glob: "*_umap.pdf" doc: | UMAP, colored by dataset. - PDF format + PDF format. umap_spl_idnt_plot_png: type: File? @@ -350,7 +358,7 @@ outputs: glob: "*_umap_spl_idnt.png" doc: | UMAP, split by dataset. - PNG format + PNG format. umap_spl_idnt_plot_pdf: type: File? @@ -358,7 +366,7 @@ outputs: glob: "*_umap_spl_idnt.pdf" doc: | UMAP, split by dataset. - PDF format + PDF format. umap_spl_cnd_plot_png: type: File? 
@@ -367,7 +375,7 @@ outputs: doc: | UMAP, colored by dataset, split by grouping condition. - PNG format + PNG format. umap_spl_cnd_plot_pdf: type: File? @@ -376,7 +384,7 @@ outputs: doc: | UMAP, colored by dataset, split by grouping condition. - PDF format + PDF format. umap_spl_frgm_plot_png: type: File? @@ -385,7 +393,7 @@ outputs: doc: | UMAP, colored by dataset, split by ATAC fragments in peaks per cell. - PNG format + PNG format. umap_spl_frgm_plot_pdf: type: File? @@ -394,7 +402,7 @@ outputs: doc: | UMAP, colored by dataset, split by ATAC fragments in peaks per cell. - PDF format + PDF format. umap_spl_peak_plot_png: type: File? @@ -403,7 +411,7 @@ outputs: doc: | UMAP, colored by dataset, split by peaks per cell. - PNG format + PNG format. umap_spl_peak_plot_pdf: type: File? @@ -412,7 +420,7 @@ outputs: doc: | UMAP, colored by dataset, split by peaks per cell. - PDF format + PDF format. umap_spl_tss_plot_png: type: File? @@ -421,7 +429,7 @@ outputs: doc: | UMAP, colored by dataset, split by TSS enrichment score. - PNG format + PNG format. umap_spl_tss_plot_pdf: type: File? @@ -430,7 +438,7 @@ outputs: doc: | UMAP, colored by dataset, split by TSS enrichment score. - PDF format + PDF format. umap_spl_ncls_plot_png: type: File? @@ -439,7 +447,7 @@ outputs: doc: | UMAP, colored by dataset, split by nucleosome signal. - PNG format + PNG format. umap_spl_ncls_plot_pdf: type: File? @@ -448,7 +456,7 @@ outputs: doc: | UMAP, colored by dataset, split by nucleosome signal. - PDF format + PDF format. umap_spl_frip_plot_png: type: File? @@ -457,7 +465,7 @@ outputs: doc: | UMAP, colored by dataset, split by FRiP. - PNG format + PNG format. umap_spl_frip_plot_pdf: type: File? @@ -466,7 +474,7 @@ outputs: doc: | UMAP, colored by dataset, split by FRiP. - PDF format + PDF format. umap_spl_blck_plot_png: type: File? @@ -475,7 +483,7 @@ outputs: doc: | UMAP, colored by dataset, split by blacklist fraction. - PNG format + PNG format. umap_spl_blck_plot_pdf: type: File? 
@@ -484,7 +492,7 @@ outputs: doc: | UMAP, colored by dataset, split by blacklist fraction. - PDF format + PDF format. umap_gr_cnd_spl_frgm_plot_png: type: File? @@ -493,7 +501,7 @@ outputs: doc: | UMAP, colored by grouping condition, split by ATAC fragments in peaks per cell. - PNG format + PNG format. umap_gr_cnd_spl_frgm_plot_pdf: type: File? @@ -502,7 +510,7 @@ outputs: doc: | UMAP, colored by grouping condition, split by ATAC fragments in peaks per cell. - PDF format + PDF format. umap_gr_cnd_spl_peak_plot_png: type: File? @@ -511,7 +519,7 @@ outputs: doc: | UMAP, colored by grouping condition, split by peaks per cell. - PNG format + PNG format. umap_gr_cnd_spl_peak_plot_pdf: type: File? @@ -520,7 +528,7 @@ outputs: doc: | UMAP, colored by grouping condition, split by peaks per cell. - PDF format + PDF format. umap_gr_cnd_spl_tss_plot_png: type: File? @@ -529,7 +537,7 @@ outputs: doc: | UMAP, colored by grouping condition, split by TSS enrichment score. - PNG format + PNG format. umap_gr_cnd_spl_tss_plot_pdf: type: File? @@ -538,7 +546,7 @@ outputs: doc: | UMAP, colored by grouping condition, split by TSS enrichment score. - PDF format + PDF format. umap_gr_cnd_spl_ncls_plot_png: type: File? @@ -547,7 +555,7 @@ outputs: doc: | UMAP, colored by grouping condition, split by nucleosome signal. - PNG format + PNG format. umap_gr_cnd_spl_ncls_plot_pdf: type: File? @@ -556,7 +564,7 @@ outputs: doc: | UMAP, colored by grouping condition, split by nucleosome signal. - PDF format + PDF format. umap_gr_cnd_spl_frip_plot_png: type: File? @@ -565,7 +573,7 @@ outputs: doc: | UMAP, colored by grouping condition, split by FRiP. - PNG format + PNG format. umap_gr_cnd_spl_frip_plot_pdf: type: File? @@ -574,7 +582,7 @@ outputs: doc: | UMAP, colored by grouping condition, split by FRiP. - PDF format + PDF format. umap_gr_cnd_spl_blck_plot_png: type: File? @@ -583,7 +591,7 @@ outputs: doc: | UMAP, colored by grouping condition, split by blacklist fraction. 
- PNG format + PNG format. umap_gr_cnd_spl_blck_plot_pdf: type: File? @@ -592,52 +600,52 @@ outputs: doc: | UMAP, colored by grouping condition, split by blacklist fraction. - PDF format + PDF format. ucsc_cb_config_data: type: Directory? outputBinding: glob: "*_cellbrowser" doc: | - Directory with UCSC Cellbrowser - configuration data. + UCSC Cell Browser configuration data. ucsc_cb_html_data: type: Directory? outputBinding: glob: "*_cellbrowser/html_data" doc: | - Directory with UCSC Cellbrowser - html data. + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? outputBinding: glob: "*_cellbrowser/html_data/index.html" doc: | - HTML index file from the directory - with UCSC Cellbrowser html data. + UCSC Cell Browser html index. seurat_data_rds: type: File outputBinding: glob: "*_data.rds" doc: | - Reduced Seurat data in RDS format + Seurat object. + RDS format. seurat_data_h5seurat: type: File? outputBinding: glob: "*_data.h5seurat" doc: | - Reduced Seurat data in h5seurat format + Seurat object. + h5Seurat format. seurat_data_h5ad: type: File? outputBinding: - glob: "*_data.h5ad" + glob: "*_counts.h5ad" doc: | - Reduced Seurat data in h5ad format + Seurat object. + H5AD format. 
stdout_log: type: stdout @@ -659,8 +667,8 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell ATAC-Seq Dimensionality Reduction Analysis" -s:name: "Single-cell ATAC-Seq Dimensionality Reduction Analysis" +label: "Single-Cell ATAC-Seq Dimensionality Reduction Analysis" +s:name: "Single-Cell ATAC-Seq Dimensionality Reduction Analysis" s:alternateName: "Integrates multiple single-cell ATAC-Seq datasets, reduces dimensionality using LSI" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-atac-reduce.cwl @@ -699,13 +707,13 @@ s:creator: doc: | - Single-cell ATAC-Seq Dimensionality Reduction Analysis + Single-Cell ATAC-Seq Dimensionality Reduction Analysis Integrates multiple single-cell ATAC-Seq datasets, reduces dimensionality using LSI. s:about: | - usage: sc_atac_reduce.R [-h] --query QUERY + usage: /usr/local/bin/sc_atac_reduce.R [-h] --query QUERY [--metadata METADATA] [--barcodes BARCODES] [--norm {log-tfidf,tf-logidf,logtf-logidf,idf}] @@ -722,8 +730,9 @@ s:about: | [--h5ad] [--cbbuild] [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] + [--seed SEED] - Single-cell ATAC-Seq Dimensionality Reduction Analysis + Single-Cell ATAC-Seq Dimensionality Reduction Analysis optional arguments: -h, --help show this help message and exit @@ -800,11 +809,13 @@ s:about: | --pdf Export plots in PDF. Default: false --verbose Print debug information. Default: false --h5seurat Save Seurat data to h5seurat file. Default: false - --h5ad Save Seurat data to h5ad file. Default: false + --h5ad Save raw counts from the ATAC assay to h5ad file. + Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --output OUTPUT Output prefix. Default: ./sc --theme {gray,bw,linedraw,light,dark,minimal,classic,void} Color theme for all generated plots. Default: classic --cpus CPUS Number of cores/cpus to use. 
Default: 1 --memory MEMORY Maximum memory in GB allowed to be shared between the - workers when using multiple --cpus. Default: 32 \ No newline at end of file + workers when using multiple --cpus. Default: 32 + --seed SEED Seed number for random values. Default: 42 \ No newline at end of file diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index 11f697de..0e193174 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.33 + dockerPull: biowardrobe2/sc-tools:v0.0.34 inputs: @@ -23,8 +23,7 @@ inputs: doc: | Path to the RDS file to load Seurat object from. This file should include genes expression and/or chromatin accessibility information stored in the RNA - and ATAC assays correspondingly. Additionally, 'rnaumap', and/or 'atacumap', - and/or 'wnnumap' dimensionality reductions should be present. + and ATAC assays correspondingly. cell_type_data: type: File @@ -50,6 +49,17 @@ inputs: assigned cell types. Should start with 'custom_', otherwise, it won't be shown in UCSC Cell Browser. + reduction: + type: string? + inputBinding: + prefix: "--reduction" + doc: | + Dimensionality reduction to be used in the generated plots. If not + provided it will be automatically defined on the basis of the --source + parameter as follows: rna_res.* - rnaumap, atac_res.* - atacumap, + wsnn_res.* - wnnumap. + Default: defined automatically + query_splitby_column: type: string? inputBinding: @@ -266,7 +276,7 @@ inputs: inputBinding: prefix: "--h5ad" doc: | - Save Seurat data to h5ad file. + Save raw counts from the RNA and/or ATAC assay(s) to h5ad file(s). Default: false export_scope_data: @@ -320,506 +330,403 @@ inputs: Number of cores/cpus to use. Default: 1 - -outputs: - - umap_rd_rnaumap_plot_png: - type: File? - outputBinding: - glob: "*_umap_rd_rnaumap.png" - doc: | - UMAP, colored by cell type, RNA. 
- PNG format - - umap_rd_rnaumap_plot_pdf: - type: File? - outputBinding: - glob: "*_umap_rd_rnaumap.pdf" - doc: | - UMAP, colored by cell type, RNA. - PDF format - - umap_rd_atacumap_plot_png: - type: File? - outputBinding: - glob: "*_umap_rd_atacumap.png" - doc: | - UMAP, colored by cell type, ATAC. - PNG format - - umap_rd_atacumap_plot_pdf: - type: File? - outputBinding: - glob: "*_umap_rd_atacumap.pdf" - doc: | - UMAP, colored by cell type, ATAC. - PDF format - - umap_rd_wnnumap_plot_png: - type: File? - outputBinding: - glob: "*_umap_rd_wnnumap.png" - doc: | - UMAP, colored by cell type, WNN. - PNG format - - umap_rd_wnnumap_plot_pdf: - type: File? - outputBinding: - glob: "*_umap_rd_wnnumap.pdf" - doc: | - UMAP, colored by cell type, WNN. - PDF format - - umap_spl_idnt_rd_rnaumap_plot_png: - type: File? - outputBinding: - glob: "*_umap_spl_idnt_rd_rnaumap.png" - doc: | - UMAP, colored by cell type, - split by dataset, RNA. - PNG format - - umap_spl_idnt_rd_rnaumap_plot_pdf: - type: File? - outputBinding: - glob: "*_umap_spl_idnt_rd_rnaumap.pdf" - doc: | - UMAP, colored by cell type, - split by dataset, RNA. - PDF format - - umap_spl_idnt_rd_atacumap_plot_png: - type: File? - outputBinding: - glob: "*_umap_spl_idnt_rd_atacumap.png" - doc: | - UMAP, colored by cell type, - split by dataset, ATAC. - PNG format - - umap_spl_idnt_rd_atacumap_plot_pdf: - type: File? - outputBinding: - glob: "*_umap_spl_idnt_rd_atacumap.pdf" + seed: + type: int? + inputBinding: + prefix: "--seed" doc: | - UMAP, colored by cell type, - split by dataset, ATAC. - PDF format + Seed number for random values. + Default: 42 - umap_spl_idnt_rd_wnnumap_plot_png: - type: File? - outputBinding: - glob: "*_umap_spl_idnt_rd_wnnumap.png" - doc: | - UMAP, colored by cell type, - split by dataset, WNN. - PNG format - umap_spl_idnt_rd_wnnumap_plot_pdf: - type: File? - outputBinding: - glob: "*_umap_spl_idnt_rd_wnnumap.pdf" - doc: | - UMAP, colored by cell type, - split by dataset, WNN. 
- PDF format +outputs: - umap_spl_cnd_rd_rnaumap_plot_png: + umap_gr_ctyp_plot_png: type: File? outputBinding: - glob: "*_umap_spl_cnd_rd_rnaumap.png" + glob: "*_umap_gr_ctyp.png" doc: | - UMAP, colored by cell type, split - by grouping condition, RNA. - PNG format + UMAP colored by cell type. + All cells. + PNG format. - umap_spl_cnd_rd_rnaumap_plot_pdf: + umap_gr_ctyp_plot_pdf: type: File? outputBinding: - glob: "*_umap_spl_cnd_rd_rnaumap.pdf" + glob: "*_umap_gr_ctyp.pdf" doc: | - UMAP, colored by cell type, split - by grouping condition, RNA. - PDF format + UMAP colored by cell type. + All cells. + PDF format. - umap_spl_cnd_rd_atacumap_plot_png: + umap_gr_ctyp_spl_idnt_plot_png: type: File? outputBinding: - glob: "*_umap_spl_cnd_rd_atacumap.png" + glob: "*_umap_gr_ctyp_spl_idnt.png" doc: | - UMAP, colored by cell type, split - by grouping condition, ATAC. - PNG format + UMAP colored by cell type. + Split by dataset; downsampled to the + smallest dataset. + PNG format. - umap_spl_cnd_rd_atacumap_plot_pdf: + umap_gr_ctyp_spl_idnt_plot_pdf: type: File? outputBinding: - glob: "*_umap_spl_cnd_rd_atacumap.pdf" + glob: "*_umap_gr_ctyp_spl_idnt.pdf" doc: | - UMAP, colored by cell type, split - by grouping condition, ATAC. - PDF format + UMAP colored by cell type. + Split by dataset; downsampled to the + smallest dataset. + PDF format. - umap_spl_cnd_rd_wnnumap_plot_png: + cmp_gr_ctyp_spl_idnt_plot_png: type: File? outputBinding: - glob: "*_umap_spl_cnd_rd_wnnumap.png" + glob: "*_cmp_gr_ctyp_spl_idnt.png" doc: | - UMAP, colored by cell type, split - by grouping condition, WNN. - PNG format + Composition plot colored by cell type. + Split by dataset; downsampled to the + smallest dataset. + PNG format. - umap_spl_cnd_rd_wnnumap_plot_pdf: + cmp_gr_ctyp_spl_idnt_plot_pdf: type: File? outputBinding: - glob: "*_umap_spl_cnd_rd_wnnumap.pdf" + glob: "*_cmp_gr_ctyp_spl_idnt.pdf" doc: | - UMAP, colored by cell type, split - by grouping condition, WNN. 
- PDF format + Composition plot colored by cell type. + Split by dataset; downsampled to the + smallest dataset. + PDF format. - umap_spl_ph_rd_rnaumap_plot_png: + cmp_gr_idnt_spl_ctyp_plot_png: type: File? outputBinding: - glob: "*_umap_spl_ph_rd_rnaumap.png" + glob: "*_cmp_gr_idnt_spl_ctyp.png" doc: | - UMAP, colored by cell type, split - by cell cycle phase, RNA. - PNG format + Composition plot colored by dataset. + Split by cell type; downsampled to + the smallest dataset. + PNG format. - umap_spl_ph_rd_rnaumap_plot_pdf: + cmp_gr_idnt_spl_ctyp_plot_pdf: type: File? outputBinding: - glob: "*_umap_spl_ph_rd_rnaumap.pdf" + glob: "*_cmp_gr_idnt_spl_ctyp.pdf" doc: | - UMAP, colored by cell type, split - by cell cycle phase, RNA. - PDF format + Composition plot colored by dataset. + Split by cell type; downsampled to + the smallest dataset. + PDF format. - umap_spl_ph_rd_atacumap_plot_png: + umap_gr_ph_spl_idnt_plot_png: type: File? outputBinding: - glob: "*_umap_spl_ph_rd_atacumap.png" + glob: "*_umap_gr_ph_spl_idnt.png" doc: | - UMAP, colored by cell type, split - by cell cycle phase, ATAC. - PNG format + UMAP colored by cell cycle phase. + Split by dataset; downsampled to the + smallest dataset. + PNG format. - umap_spl_ph_rd_atacumap_plot_pdf: + umap_gr_ph_spl_idnt_plot_pdf: type: File? outputBinding: - glob: "*_umap_spl_ph_rd_atacumap.pdf" + glob: "*_umap_gr_ph_spl_idnt.pdf" doc: | - UMAP, colored by cell type, split - by cell cycle phase, ATAC. - PDF format + UMAP colored by cell cycle phase. + Split by dataset; downsampled to the + smallest dataset. + PDF format. - umap_spl_ph_rd_wnnumap_plot_png: + cmp_gr_ph_spl_idnt_plot_png: type: File? outputBinding: - glob: "*_umap_spl_ph_rd_wnnumap.png" + glob: "*_cmp_gr_ph_spl_idnt.png" doc: | - UMAP, colored by cell type, split - by cell cycle phase, WNN. - PNG format + Composition plot colored by cell cycle phase. + Split by dataset; downsampled to the smallest + dataset. + PNG format. 
- umap_spl_ph_rd_wnnumap_plot_pdf: + cmp_gr_ph_spl_idnt_plot_pdf: type: File? outputBinding: - glob: "*_umap_spl_ph_rd_wnnumap.pdf" + glob: "*_cmp_gr_ph_spl_idnt.pdf" doc: | - UMAP, colored by cell type, split - by cell cycle phase, WNN. - PDF format + Composition plot colored by cell cycle phase. + Split by dataset; downsampled to the smallest + dataset. + PDF format. - cmp_gr_ctyp_spl_idnt_plot_png: + umap_gr_ctyp_spl_ph_png: type: File? outputBinding: - glob: "*_cmp_gr_ctyp_spl_idnt.png" + glob: "*_umap_gr_ctyp_spl_ph.png" doc: | - Composition plot, colored by cell - type, split by dataset, downsampled. - PNG format + UMAP colored by cell type. + Split by cell cycle phase; downsampled + to the smallest dataset (if multiple + datasets are analyzed jointly). + PNG format. - cmp_gr_ctyp_spl_idnt_plot_pdf: + umap_gr_ctyp_spl_ph_plot_pdf: type: File? outputBinding: - glob: "*_cmp_gr_ctyp_spl_idnt.pdf" + glob: "*_umap_gr_ctyp_spl_ph.pdf" doc: | - Composition plot, colored by cell - type, split by dataset, downsampled. - PDF format + UMAP colored by cell type. + Split by cell cycle phase; downsampled + to the smallest dataset (if multiple + datasets are analyzed jointly). + PDF format. - cmp_gr_idnt_spl_ctyp_plot_png: + cmp_gr_ph_spl_ctyp_png: type: File? outputBinding: - glob: "*_cmp_gr_idnt_spl_ctyp.png" + glob: "*_cmp_gr_ph_spl_ctyp.png" doc: | - Composition plot, colored by - dataset, split by cell type, - downsampled. - PNG format + Composition plot colored by cell cycle phase. + Split by cell type; downsampled to the + smallest dataset (if multiple datasets are + analyzed jointly). + PNG format. - cmp_gr_idnt_spl_ctyp_plot_pdf: + cmp_gr_ph_spl_ctyp_plot_pdf: type: File? outputBinding: - glob: "*_cmp_gr_idnt_spl_ctyp.pdf" + glob: "*_cmp_gr_ph_spl_ctyp.pdf" doc: | - Composition plot, colored by - dataset, split by cell type, - downsampled. - PDF format + Composition plot colored by cell cycle phase. 
+ Split by cell type; downsampled to the + smallest dataset (if multiple datasets are + analyzed jointly). + PDF format. - cmp_gr_ph_spl_idnt_plot_png: + umap_gr_ctyp_spl_cnd_plot_png: type: File? outputBinding: - glob: "*_cmp_gr_ph_spl_idnt.png" + glob: "*_umap_gr_ctyp_spl_cnd.png" doc: | - Composition plot, colored by cell - cycle phase, split by dataset, - downsampled. - PNG format + UMAP colored by cell type. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. - cmp_gr_ph_spl_idnt_plot_pdf: + umap_gr_ctyp_spl_cnd_plot_pdf: type: File? outputBinding: - glob: "*_cmp_gr_ph_spl_idnt.pdf" + glob: "*_umap_gr_ctyp_spl_cnd.pdf" doc: | - Composition plot, colored by cell - cycle phase, split by dataset, - downsampled. - PDF format + UMAP colored by cell type. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PDF format. cmp_gr_ctyp_spl_cnd_plot_png: type: File? outputBinding: glob: "*_cmp_gr_ctyp_spl_cnd.png" doc: | - Composition plot, colored by cell - type, split by grouping condition, - downsampled. - PNG format + Composition plot colored by cell type. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. cmp_gr_ctyp_spl_cnd_plot_pdf: type: File? outputBinding: glob: "*_cmp_gr_ctyp_spl_cnd.pdf" doc: | - Composition plot, colored by cell - type, split by grouping condition, - downsampled. - PDF format + Composition plot colored by cell type. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PDF format. cmp_gr_cnd_spl_ctyp_plot_png: type: File? outputBinding: glob: "*_cmp_gr_cnd_spl_ctyp.png" doc: | - Composition plot, colored by - grouping condition, split by - cell type, downsampled. - PNG format + Composition plot colored by grouping condition. 
+ Split by cell type; first downsampled to the + smallest dataset, then downsampled to the + smallest group. + PNG format. cmp_gr_cnd_spl_ctyp_plot_pdf: type: File? outputBinding: glob: "*_cmp_gr_cnd_spl_ctyp.pdf" doc: | - Composition plot, colored by - grouping condition, split by - cell type, downsampled. - PDF format + Composition plot colored by grouping condition. + Split by cell type; first downsampled to the + smallest dataset, then downsampled to the + smallest group. + PDF format. - cmp_gr_ph_spl_ctyp_plot_png: + umap_gr_ph_spl_cnd_plot_png: type: File? outputBinding: - glob: "*_cmp_gr_ph_spl_ctyp.png" + glob: "*_umap_gr_ph_spl_cnd.png" doc: | - Composition plot, colored by cell - cycle phase, split by cell type, - downsampled. - PNG format + UMAP colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. - cmp_gr_ph_spl_ctyp_plot_pdf: + umap_gr_ph_spl_cnd_plot_pdf: type: File? outputBinding: - glob: "*_cmp_gr_ph_spl_ctyp.pdf" + glob: "*_umap_gr_ph_spl_cnd.pdf" doc: | - Composition plot, colored by cell - cycle phase, split by cell type, - downsampled. - PDF format + UMAP colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PDF format. - xpr_avg_plot_png: + cmp_gr_ph_spl_cnd_plot_png: type: File? outputBinding: - glob: "*_xpr_avg.png" + glob: "*_cmp_gr_ph_spl_cnd.png" doc: | - Gene expression dot plot. - PNG format + Composition plot colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. - xpr_avg_plot_pdf: + cmp_gr_ph_spl_cnd_plot_pdf: type: File? outputBinding: - glob: "*_xpr_avg.pdf" + glob: "*_cmp_gr_ph_spl_cnd.pdf" doc: | - Gene expression dot plot. - PDF format + Composition plot colored by cell cycle phase. 
+ Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PDF format. - xpr_dnst_plot_png: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_xpr_dnst_*.png" - doc: | - Gene expression violin plot. - PNG format - - xpr_dnst_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_xpr_dnst_*.pdf" - doc: | - Gene expression violin plot. - PDF format - - xpr_per_cell_rd_rnaumap_plot_png: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_xpr_per_cell_rd_rnaumap_*.png" - doc: | - UMAP, gene expression, RNA. - PNG format - - xpr_per_cell_rd_rnaumap_plot_pdf: - type: - - "null" - - type: array - items: File + xpr_avg_plot_png: + type: File? outputBinding: - glob: "*_xpr_per_cell_rd_rnaumap_*.pdf" + glob: "*_xpr_avg.png" doc: | - UMAP, gene expression, RNA. - PDF format + Average gene expression. + PNG format. - xpr_per_cell_rd_atacumap_plot_png: - type: - - "null" - - type: array - items: File + xpr_avg_plot_pdf: + type: File? outputBinding: - glob: "*_xpr_per_cell_rd_atacumap_*.png" + glob: "*_xpr_avg.pdf" doc: | - UMAP, gene expression, ATAC. - PNG format + Average gene expression. + PDF format. - xpr_per_cell_rd_atacumap_plot_pdf: + xpr_per_cell_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_per_cell_rd_atacumap_*.pdf" + glob: "*_xpr_per_cell_*.png" doc: | - UMAP, gene expression, ATAC. - PDF format + UMAP colored by gene expression. + All genes of interest. + PNG format. - xpr_per_cell_rd_wnnumap_plot_png: + xpr_per_cell_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_per_cell_rd_wnnumap_*.png" + glob: "*_xpr_per_cell_*.pdf" doc: | - UMAP, gene expression, WNN. - PNG format + UMAP colored by gene expression. + All genes of interest. + PDF format. 
- xpr_per_cell_rd_wnnumap_plot_pdf: + xpr_per_cell_sgnl_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_per_cell_rd_wnnumap_*.pdf" + glob: "*_xpr_per_cell_sgnl_*.png" doc: | - UMAP, gene expression, WNN. - PDF format + UMAP colored by gene expression density. + All genes of interest. + PNG format. - xpr_per_cell_sgnl_rd_rnaumap_plot_png: + xpr_per_cell_sgnl_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_per_cell_sgnl_rd_rnaumap_*.png" + glob: "*_xpr_per_cell_sgnl_*.pdf" doc: | - UMAP, gene expression density, RNA. - PNG format + UMAP colored by gene expression density. + All genes of interest. + PDF format. - xpr_per_cell_sgnl_rd_rnaumap_plot_pdf: + xpr_dnst_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_per_cell_sgnl_rd_rnaumap_*.pdf" + glob: "*_xpr_dnst_*.png" doc: | - UMAP, gene expression density, RNA. - PDF format + Gene expression density. + All genes of interest. + PNG format. - xpr_per_cell_sgnl_rd_atacumap_plot_png: + xpr_dnst_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_per_cell_sgnl_rd_atacumap_*.png" + glob: "*_xpr_dnst_*.pdf" doc: | - UMAP, gene expression density, ATAC. - PNG format + Gene expression density. + All genes of interest. + PDF format. - xpr_per_cell_sgnl_rd_atacumap_plot_pdf: - type: - - "null" - - type: array - items: File + xpr_htmp_plot_png: + type: File? outputBinding: - glob: "*_xpr_per_cell_sgnl_rd_atacumap_*.pdf" + glob: "*_xpr_htmp.png" doc: | - UMAP, gene expression density, ATAC. - PDF format + Gene expression heatmap. + Top gene markers. + PNG format. - xpr_per_cell_sgnl_rd_wnnumap_plot_png: - type: - - "null" - - type: array - items: File + xpr_htmp_plot_pdf: + type: File? outputBinding: - glob: "*_xpr_per_cell_sgnl_rd_wnnumap_*.png" + glob: "*_xpr_htmp.pdf" doc: | - UMAP, gene expression density, WNN. - PNG format + Gene expression heatmap. + Top gene markers. + PDF format. 
- xpr_per_cell_sgnl_rd_wnnumap_plot_pdf: - type: - - "null" - - type: array - items: File + xpr_htmp_tsv: + type: File? outputBinding: - glob: "*_xpr_per_cell_sgnl_rd_wnnumap_*.pdf" + glob: "*_xpr_htmp.tsv" doc: | - UMAP, gene expression density, WNN. - PDF format + Gene expression heatmap. + Top gene markers. + TSV format. cvrg_plot_png: type: @@ -829,8 +736,9 @@ outputs: outputBinding: glob: "*_cvrg_*.png" doc: | - ATAC fragments coverage. - PNG format + ATAC fragment coverage. + All genes of interest. + PNG format. cvrg_plot_pdf: type: @@ -840,104 +748,89 @@ outputs: outputBinding: glob: "*_cvrg_*.pdf" doc: | - ATAC fragments coverage. - PDF format - - xpr_htmp_plot_png: - type: File? - outputBinding: - glob: "*_xpr_htmp.png" - doc: | - Gene expression heatmap. - PNG format - - xpr_htmp_plot_pdf: - type: File? - outputBinding: - glob: "*_xpr_htmp.pdf" - doc: | - Gene expression heatmap. - PDF format - - xpr_htmp_tsv: - type: File? - outputBinding: - glob: "*_xpr_htmp.tsv" - doc: | - Gene markers used for gene - expression heatmap. - TSV format + ATAC fragment coverage. + All genes of interest. + PDF format. gene_markers_tsv: type: File? outputBinding: glob: "*_gene_markers.tsv" doc: | - Differentially expressed genes - between each pair of cell types. - TSV format + Gene markers. + TSV format. peak_markers_tsv: type: File? outputBinding: glob: "*_peak_markers.tsv" doc: | - Differentially accessible peaks - between each pair of cell types. - TSV format + Peak markers. + TSV format. ucsc_cb_config_data: type: Directory? outputBinding: glob: "*_cellbrowser" doc: | - Directory with UCSC Cellbrowser - configuration data. + UCSC Cell Browser configuration data. ucsc_cb_html_data: type: Directory? outputBinding: glob: "*_cellbrowser/html_data" doc: | - Directory with UCSC Cellbrowser - html data. + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? 
outputBinding: glob: "*_cellbrowser/html_data/index.html" doc: | - HTML index file from the directory - with UCSC Cellbrowser html data. + UCSC Cell Browser html index. seurat_data_rds: type: File outputBinding: glob: "*_data.rds" doc: | - Reduced Seurat data in RDS format + Seurat object. + RDS format. seurat_data_h5seurat: type: File? outputBinding: glob: "*_data.h5seurat" doc: | - Reduced Seurat data in h5seurat format + Seurat object. + h5Seurat format. + + seurat_rna_data_h5ad: + type: File? + outputBinding: + glob: "*_rna_counts.h5ad" + doc: | + Seurat object. + RNA counts. + H5AD format. - seurat_data_h5ad: + seurat_atac_data_h5ad: type: File? outputBinding: - glob: "*_data.h5ad" + glob: "*_atac_counts.h5ad" doc: | - Reduced Seurat data in h5ad format + Seurat object. + ATAC counts. + H5AD format. seurat_data_scope: type: File? outputBinding: glob: "*_data.loom" doc: | - Reduced Seurat data in SCope - compatible loom format + Seurat object. + SCope compatible. + Loom format. stdout_log: type: stdout @@ -959,8 +852,8 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell Manual Cell Type Assignment" -s:name: "Single-cell Manual Cell Type Assignment" +label: "Single-Cell Manual Cell Type Assignment" +s:name: "Single-Cell Manual Cell Type Assignment" s:alternateName: "Assigns cell types for clusters based on the provided metadata file" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-ctype-assign.cwl @@ -999,18 +892,18 @@ s:creator: doc: | - Single-cell Manual Cell Type Assignment + Single-Cell Manual Cell Type Assignment Assigns cell types for clusters based on the provided metadata file. 
s:about: | - usage: sc_ctype_assign.R [-h] --query QUERY --celltypes + usage: /usr/local/bin/sc_ctype_assign.R [-h] --query QUERY --celltypes CELLTYPES --source SOURCE --target TARGET [--splitby SPLITBY] - [--diffgenes] [--diffpeaks] - [--rnalogfc RNALOGFC] + [--reduction REDUCTION] [--diffgenes] + [--diffpeaks] [--rnalogfc RNALOGFC] [--rnaminpct RNAMINPCT] [--rnaonlypos] [--rnatestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] [--ataclogfc ATACLOGFC] @@ -1025,17 +918,16 @@ s:about: | [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] + [--seed SEED] - Single-cell Manual Cell Type Assignment + Single-Cell Manual Cell Type Assignment optional arguments: -h, --help show this help message and exit --query QUERY Path to the RDS file to load Seurat object from. This file should include genes expression and/or chromatin accessibility information stored in the RNA and ATAC - assays correspondingly. Additionally, 'rnaumap', - and/or 'atacumap', and/or 'wnnumap' dimensionality - reductions should be present. + assays correspondingly. --celltypes CELLTYPES Path to the TSV/CSV file for manual cell type assignment for each of the clusters. First column - @@ -1049,6 +941,12 @@ s:about: | --splitby SPLITBY Column from the Seurat object metadata to additionally split every cluster selected with --source into smaller groups. Default: do not split + --reduction REDUCTION + Dimensionality reduction to be used in the generated + plots. If not provided it will be automatically + defined on the basis of the --source parameter as + follows: rna_res.* - rnaumap, atac_res.* - atacumap, + wsnn_res.* - wnnumap. Default: defined automatically --diffgenes Identify differentially expressed genes (putative gene markers) for assigned cell types. Ignored if loaded Seurat object doesn't include genes expression @@ -1116,7 +1014,8 @@ s:about: | --pdf Export plots in PDF. Default: false --verbose Print debug information. 
Default: false --h5seurat Save Seurat data to h5seurat file. Default: false - --h5ad Save Seurat data to h5ad file. Default: false + --h5ad Save raw counts from the RNA and/or ATAC assay(s) to + h5ad file(s). Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --scope Save Seurat data to SCope compatible loom file. Only not normalized raw counts from the RNA assay will be @@ -1127,4 +1026,5 @@ s:about: | Color theme for all generated plots. Default: classic --cpus CPUS Number of cores/cpus to use. Default: 1 --memory MEMORY Maximum memory in GB allowed to be shared between the - workers when using multiple --cpus. Default: 32 \ No newline at end of file + workers when using multiple --cpus. Default: 32 + --seed SEED Seed number for random values. Default: 42 \ No newline at end of file diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index c1fdcea7..d8d37e00 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.33 + dockerPull: biowardrobe2/sc-tools:v0.0.34 inputs: @@ -134,7 +134,8 @@ inputs: doc: | Include cells where at least this many genes are detected. If multiple values provided, each of them will be applied to the correspondent dataset from the - '--mex' input based on the '--identity' file. + '--mex' input based on the '--identity' file. Any 0 will be replaced with the + auto-estimated threshold (median - 2.5 * MAD) calculated per dataset. Default: 250 (applied to all datasets) maximum_genes: @@ -147,7 +148,8 @@ inputs: doc: | Include cells with the number of genes not bigger than this value. If multiple values provided, each of them will be applied to the correspondent dataset from - the '--mex' input based on the '--identity' file. + the '--mex' input based on the '--identity' file. 
Any 0 will be replaced with the + auto-estimated threshold (median + 5 * MAD) calculated per dataset. Default: 5000 (applied to all datasets) minimum_umis: @@ -161,6 +163,8 @@ inputs: Include cells where at least this many RNA reads are detected. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. + Any 0 will be replaced with the auto-estimated threshold (median - 2.5 * MAD) + calculated per dataset. Default: 500 (applied to all datasets) mito_pattern: @@ -177,7 +181,9 @@ inputs: prefix: "--maxmt" doc: | Include cells with the percentage of RNA reads mapped to mitochondrial - genes not bigger than this value. + genes not bigger than this value. Set to 0 for using an auto-estimated + threshold equal to the maximum among (median + 2 * MAD) values calculated + per dataset. Default: 5 (applied to all datasets) minimum_novelty_score: @@ -213,7 +219,8 @@ inputs: Include cells where at least this many ATAC fragments in peaks are detected. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input - based on the '--identity' file. + based on the '--identity' file. Any 0 will be replaced with the + auto-estimated threshold (median - 2.5 * MAD) calculated per dataset. Default: 1000 (applied to all datasets) maximum_nucl_signal: @@ -400,7 +407,7 @@ inputs: inputBinding: prefix: "--h5ad" doc: | - Save Seurat data to h5ad file. + Save raw counts from the RNA and ATAC assays to h5ad files. Default: false export_ucsc_cb: @@ -442,6 +449,14 @@ inputs: Number of cores/cpus to use. Default: 1 + seed: + type: int? + inputBinding: + prefix: "--seed" + doc: | + Seed number for random values. + Default: 42 + outputs: @@ -541,6 +556,22 @@ outputs: Genes vs RNA reads per cell (not filtered). PDF format + raw_umi_mito_plot_png: + type: File? + outputBinding: + glob: "*_raw_umi_mito.png" + doc: | + RNA reads vs mitochondrial % per cell (not filtered). 
+ PNG format + + raw_umi_mito_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_umi_mito.pdf" + doc: | + RNA reads vs mitochondrial % per cell (not filtered). + PDF format + raw_mito_dnst_plot_png: type: File? outputBinding: @@ -961,6 +992,22 @@ outputs: Genes vs RNA reads per cell (intermediate filtered). PDF format + mid_umi_mito_plot_png: + type: File? + outputBinding: + glob: "*_mid_umi_mito.png" + doc: | + RNA reads vs mitochondrial % per cell (intermediate filtered). + PNG format + + mid_umi_mito_plot_pdf: + type: File? + outputBinding: + glob: "*_mid_umi_mito.pdf" + doc: | + RNA reads vs mitochondrial % per cell (intermediate filtered). + PDF format + mid_fltr_mito_dnst_plot_png: type: File? outputBinding: @@ -1381,6 +1428,22 @@ outputs: Genes vs RNA reads per cell (filtered). PDF format + fltr_umi_mito_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_umi_mito.png" + doc: | + RNA reads vs mitochondrial % per cell (filtered). + PNG format + + fltr_umi_mito_plot_pdf: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_umi_mito.pdf" + doc: | + RNA reads vs mitochondrial % per cell (filtered). + PDF format + fltr_mito_dnst_plot_png: type: File? outputBinding: @@ -1710,28 +1773,29 @@ outputs: outputBinding: glob: "*_cellbrowser" doc: | - Directory with UCSC Cellbrowser configuration data. + UCSC Cell Browser configuration data. ucsc_cb_html_data: type: Directory? outputBinding: glob: "*_cellbrowser/html_data" doc: | - Directory with UCSC Cellbrowser html data. + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? outputBinding: glob: "*_cellbrowser/html_data/index.html" doc: | - HTML index file from the directory with UCSC Cellbrowser html data. + UCSC Cell Browser html index. seurat_data_rds: type: File outputBinding: glob: "*_data.rds" doc: | - Filtered Seurat data in RDS format + Seurat object. 
+ RDS format datasets_metadata: type: File @@ -1746,14 +1810,26 @@ outputs: outputBinding: glob: "*_data.h5seurat" doc: | - Filtered Seurat data in h5seurat format + Seurat object. + h5Seurat format + + seurat_rna_data_h5ad: + type: File? + outputBinding: + glob: "*_rna_counts.h5ad" + doc: | + Seurat object. + RNA counts. + H5AD format. - seurat_data_h5ad: + seurat_atac_data_h5ad: type: File? outputBinding: - glob: "*_data.h5ad" + glob: "*_atac_counts.h5ad" doc: | - Reduced Seurat data in h5ad format + Seurat object. + ATAC counts. + H5AD format. stdout_log: type: stdout @@ -1786,8 +1862,8 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell Multiome ATAC and RNA-Seq Filtering Analysis" -s:name: "Single-cell Multiome ATAC and RNA-Seq Filtering Analysis" +label: "Single-Cell Multiome ATAC-Seq and RNA-Seq Filtering Analysis" +s:name: "Single-Cell Multiome ATAC-Seq and RNA-Seq Filtering Analysis" s:alternateName: "Filters single-cell multiome ATAC and RNA-Seq datasets based on the common QC metrics" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-multiome-filter.cwl @@ -1826,13 +1902,13 @@ s:creator: doc: | - Single-cell Multiome ATAC and RNA-Seq Filtering Analysis + Single-Cell Multiome ATAC-Seq and RNA-Seq Filtering Analysis Filters single-cell multiome ATAC and RNA-Seq datasets based on the common QC metrics. 
s:about: | - usage: sc_multiome_filter.R [-h] --mex MEX --identity IDENTITY + usage: /usr/local/bin/sc_multiome_filter.R [-h] --mex MEX --identity IDENTITY --fragments FRAGMENTS --annotations ANNOTATIONS --seqinfo SEQINFO [--grouping GROUPING] @@ -1862,8 +1938,9 @@ s:about: | [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] + [--seed SEED] - Single-cell Multiome ATAC and RNA-Seq Filtering Analysis + Single-Cell Multiome ATAC-Seq and RNA-Seq Filtering Analysis optional arguments: -h, --help show this help message and exit @@ -1909,27 +1986,34 @@ s:about: | Include cells where at least this many genes are detected. If multiple values provided, each of them will be applied to the correspondent dataset from the - '--mex' input based on the '--identity' file. Default: + '--mex' input based on the '--identity' file. Any 0 + will be replaced with the auto-estimated threshold + (median - 2.5 * MAD) calculated per dataset. Default: 250 (applied to all datasets) --maxgenes [MAXGENES [MAXGENES ...]] Include cells with the number of genes not bigger than this value. If multiple values provided, each of them will be applied to the correspondent dataset from the - '--mex' input based on the '--identity' file. Default: + '--mex' input based on the '--identity' file. Any 0 + will be replaced with the auto-estimated threshold + (median + 5 * MAD) calculated per dataset. Default: 5000 (applied to all datasets) --minumis [MINUMIS [MINUMIS ...]] - Include cells where at least this many RNA reads - are detected. If multiple values - provided, each of them will be applied to the - correspondent dataset from the '--mex' input based on - the '--identity' file. Default: 500 (applied to all - datasets) + Include cells where at least this many RNA reads are + detected. If multiple values provided, each of them + will be applied to the correspondent dataset from the + '--mex' input based on the '--identity' file. 
Any 0 + will be replaced with the auto-estimated threshold + (median - 2.5 * MAD) calculated per dataset. Default: + 500 (applied to all datasets) --mitopattern MITOPATTERN Regex pattern to identify mitochondrial genes. Default: '^mt-|^MT-' - --maxmt MAXMT Include cells with the percentage of RNA reads - mapped to mitochondrial genes not bigger than this - value. Default: 5 (applied to all datasets) + --maxmt MAXMT Include cells with the percentage of RNA reads mapped + to mitochondrial genes not bigger than this value. Set + to 0 for using an auto-estimated threshold equal to + the maximum among (median + 2 * MAD) values calculated + per dataset. Default: 5 (applied to all datasets) --minnovelty [MINNOVELTY [MINNOVELTY ...]] Include cells with the novelty score not lower than this value, calculated for as log10(genes)/log10(UMI) @@ -1941,34 +2025,37 @@ s:about: | Include only peaks detected in at least this many cells. Default: 5 (applied to all datasets) --minfragments [MINFRAGMENTS [MINFRAGMENTS ...]] - Include cells where at least this many ATAC fragments in - peaks are detected. If multiple values provided, each - of them will be applied to the correspondent dataset - from the '--mex' input based on the '--identity' file. - Default: 1000 (applied to all datasets) + Include cells where at least this many ATAC fragments + in peaks are detected. If multiple values provided, + each of them will be applied to the correspondent + dataset from the '--mex' input based on the '-- + identity' file. Any 0 will be replaced with the auto- + estimated threshold (median - 2.5 * MAD) calculated + per dataset. Default: 1000 (applied to all datasets) --maxnuclsignal [MAXNUCLSIGNAL [MAXNUCLSIGNAL ...]] Include cells with the nucleosome signal not bigger than this value. Nucleosome signal quantifies the approximate ratio of mononucleosomal to nucleosome- - free ATAC fragments. 
If multiple values provided, each of - them will be applied to the correspondent dataset from - the '--mex' input based on the '--identity' file. + free ATAC fragments. If multiple values provided, each + of them will be applied to the correspondent dataset + from the '--mex' input based on the '--identity' file. Default: 4 (applied to all datasets) --mintssenrich [MINTSSENRICH [MINTSSENRICH ...]] Include cells with the TSS enrichment score not lower than this value. Score is calculated based on the - ratio of ATAC fragments centered at the TSS to ATAC fragments in - TSS-flanking regions. If multiple values provided, - each of them will be applied to the correspondent - dataset from the '--mex' input based on the '-- - identity' file. Default: 2 (applied to all datasets) + ratio of ATAC fragments centered at the TSS to ATAC + fragments in TSS-flanking regions. If multiple values + provided, each of them will be applied to the + correspondent dataset from the '--mex' input based on + the '--identity' file. Default: 2 (applied to all + datasets) --minfrip [MINFRIP [MINFRIP ...]] Include cells with the FRiP not lower than this value. If multiple values provided, each of them will be applied to the correspondent dataset from the '--mex' input based on the '--identity' file. FRiP is - calculated for ATAC fragments. Default: 0.15 (applied to - all datasets) + calculated for ATAC fragments. Default: 0.15 (applied + to all datasets) --maxblacklist [MAXBLACKLIST [MAXBLACKLIST ...]] Include cells with the fraction of ATAC fragments in genomic blacklist regions not bigger than this value. @@ -2009,7 +2096,8 @@ s:about: | --pdf Export plots in PDF. Default: false --verbose Print debug information. Default: false --h5seurat Save Seurat data to h5seurat file. Default: false - --h5ad Save Seurat data to h5ad file. Default: false + --h5ad Save raw counts from the RNA and ATAC assays to h5ad + files. Default: false --cbbuild Export results to UCSC Cell Browser. 
Default: false --tmpdir TMPDIR Directory to keep temporary files. Default: either /tmp or defined by environment variables TMPDIR, TMP, @@ -2019,4 +2107,5 @@ s:about: | Color theme for all generated plots. Default: classic --cpus CPUS Number of cores/cpus to use. Default: 1 --memory MEMORY Maximum memory in GB allowed to be shared between the - workers when using multiple '--cpus'. Default: 32 \ No newline at end of file + workers when using multiple '--cpus'. Default: 32 + --seed SEED Seed number for random values. Default: 42 \ No newline at end of file diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index 05ea0a20..e7b96262 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.33 + dockerPull: biowardrobe2/sc-tools:v0.0.34 inputs: @@ -31,7 +31,8 @@ inputs: prefix: "--dimensions" doc: | Dimensionality to use when constructing nearest-neighbor - graph before clustering (from 1 to 50). + graph before clustering (from 1 to 50). Set to 0 to use + auto-estimated dimensionality. Default: 10 cluster_metric: @@ -198,7 +199,7 @@ inputs: inputBinding: prefix: "--h5ad" doc: | - Save Seurat data to h5ad file. + Save raw counts from the RNA assay to h5ad file. Default: false export_scope_data: @@ -248,76 +249,174 @@ inputs: Number of cores/cpus to use. Default: 1 + seed: + type: int? + inputBinding: + prefix: "--seed" + doc: | + Seed number for random values. + Default: 42 + outputs: - umap_res_plot_png: + umap_gr_ph_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_ph_spl_idnt.png" + doc: | + UMAP colored by cell cycle phase. + Split by dataset; downsampled to the + smallest dataset. + PNG format. + + umap_gr_ph_spl_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_gr_ph_spl_idnt.pdf" + doc: | + UMAP colored by cell cycle phase. + Split by dataset; downsampled to the + smallest dataset. + PDF format. 
+ + cmp_gr_ph_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_idnt.png" + doc: | + Composition plot colored by cell cycle phase. + Split by dataset; downsampled to the smallest + dataset. + PNG format + + cmp_gr_ph_spl_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_idnt.pdf" + doc: | + Composition plot colored by cell cycle phase. + Split by dataset; downsampled to the smallest + dataset. + PDF format + + umap_gr_ph_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_ph_spl_cnd.png" + doc: | + UMAP colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + + umap_gr_ph_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_gr_ph_spl_cnd.pdf" + doc: | + UMAP colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PDF format. + + cmp_gr_ph_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_cnd.png" + doc: | + Composition plot colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + + cmp_gr_ph_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_cnd.pdf" + doc: | + Composition plot colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PDF format. + + umap_gr_clst_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_res_*.png" + glob: "*_umap_gr_clst_res_*.png" doc: | - UMAP, colored by cluster. + UMAP colored by cluster. + All cells; all resolutions. 
PNG format - umap_res_plot_pdf: + umap_gr_clst_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_res_*.pdf" + glob: "*_umap_gr_clst_res_*.pdf" doc: | - UMAP, colored by cluster. + UMAP colored by cluster. + All cells; all resolutions. PDF format - slh_res_plot_png: + slh_gr_clst_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_slh_res_*.png" + glob: "*_slh_gr_clst_res_*.png" doc: | Silhouette scores. + All cells; all resolutions. PNG format - slh_res_plot_pdf: + slh_gr_clst_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_slh_res_*.pdf" + glob: "*_slh_gr_clst_res_*.pdf" doc: | Silhouette scores. + All cells; all resolutions. PDF format - umap_spl_idnt_res_plot_png: + umap_gr_clst_spl_idnt_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_spl_idnt_res_*.png" + glob: "*_umap_gr_clst_spl_idnt_res_*.png" doc: | - UMAP, colored by cluster, - split by dataset. - PNG format + UMAP colored by cluster. + Split by dataset; downsampled to the + smallest dataset; all resolutions. + PNG format. - umap_spl_idnt_res_plot_pdf: + umap_gr_clst_spl_idnt_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_spl_idnt_res_*.pdf" + glob: "*_umap_gr_clst_spl_idnt_res_*.pdf" doc: | - UMAP, colored by cluster, - split by dataset. - PDF format + UMAP colored by cluster. + Split by dataset; downsampled to the + smallest dataset; all resolutions. + PDF format. cmp_gr_clst_spl_idnt_res_plot_png: type: @@ -327,10 +426,10 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_idnt_res_*.png" doc: | - Composition plot, colored by - cluster, split by dataset, - downsampled. - PNG format + Composition plot colored by cluster. + Split by dataset; downsampled to the + smallest dataset; all resolutions. + PNG format. 
cmp_gr_clst_spl_idnt_res_plot_pdf: type: @@ -340,10 +439,10 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_idnt_res_*.pdf" doc: | - Composition plot, colored by - cluster, split by dataset, - downsampled. - PDF format + Composition plot colored by cluster. + Split by dataset; downsampled to the + smallest dataset; all resolutions. + PDF format. cmp_gr_idnt_spl_clst_res_plot_png: type: @@ -353,10 +452,10 @@ outputs: outputBinding: glob: "*_cmp_gr_idnt_spl_clst_res_*.png" doc: | - Composition plot, colored by - dataset, split by cluster, - downsampled. - PNG format + Composition plot colored by dataset. + Split by cluster; downsampled to the + smallest dataset; all resolutions. + PNG format. cmp_gr_idnt_spl_clst_res_plot_pdf: type: @@ -366,222 +465,224 @@ outputs: outputBinding: glob: "*_cmp_gr_idnt_spl_clst_res_*.pdf" doc: | - Composition plot, colored by - dataset, split by cluster, - downsampled. - PDF format + Composition plot colored by dataset. + Split by cluster; downsampled to the + smallest dataset; all resolutions. + PDF format. - umap_spl_cnd_res_plot_png: + umap_gr_clst_spl_ph_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_spl_cnd_res_*.png" + glob: "*_umap_gr_clst_spl_ph_res_*.png" doc: | - UMAP, colored by cluster, - split by grouping condition. - PNG format + UMAP colored by cluster. + Split by cell cycle phase; downsampled + to the smallest dataset (if multiple + datasets are analyzed jointly); all + resolutions. + PNG format. - umap_spl_cnd_res_plot_pdf: + umap_gr_clst_spl_ph_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_spl_cnd_res_*.pdf" + glob: "*_umap_gr_clst_spl_ph_res_*.pdf" doc: | - UMAP, colored by cluster, - split by grouping condition. - PDF format + UMAP colored by cluster. + Split by cell cycle phase; downsampled + to the smallest dataset (if multiple + datasets are analyzed jointly); all + resolutions. + PDF format. 
- cmp_gr_clst_spl_cnd_res_plot_png: + cmp_gr_ph_spl_clst_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_cmp_gr_clst_spl_cnd_res_*.png" + glob: "*_cmp_gr_ph_spl_clst_res_*.png" doc: | - Composition plot, colored by - cluster, split by grouping - condition, downsampled. + Composition plot colored by cell cycle phase. + Split by cluster; downsampled to the smallest + dataset (if multiple datasets are analyzed + jointly); all resolutions. PNG format - cmp_gr_clst_spl_cnd_res_plot_pdf: + cmp_gr_ph_spl_clst_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_cmp_gr_clst_spl_cnd_res_*.pdf" + glob: "*_cmp_gr_ph_spl_clst_res_*.pdf" doc: | - Composition plot, colored by - cluster, split by grouping - condition, downsampled. + Composition plot colored by cell cycle phase. + Split by cluster; downsampled to the smallest + dataset (if multiple datasets are analyzed + jointly); all resolutions. PDF format - cmp_gr_cnd_spl_clst_res_plot_png: + umap_gr_clst_spl_cnd_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_cmp_gr_cnd_spl_clst_res_*.png" + glob: "*_umap_gr_clst_spl_cnd_res_*.png" doc: | - Composition plot, colored by - grouping condition, split by - cluster, downsampled. - PNG format + UMAP colored by cluster. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group; all resolutions. + PNG format. - cmp_gr_cnd_spl_clst_res_plot_pdf: + umap_gr_clst_spl_cnd_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_cmp_gr_cnd_spl_clst_res_*.pdf" + glob: "*_umap_gr_clst_spl_cnd_res_*.pdf" doc: | - Composition plot, colored by - grouping condition, split by - cluster, downsampled. - PDF format + UMAP colored by cluster. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group; all resolutions. + PDF format. 
- umap_spl_ph_res_plot_png: + cmp_gr_clst_spl_cnd_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_spl_ph_res_*.png" + glob: "*_cmp_gr_clst_spl_cnd_res_*.png" doc: | - UMAP, colored by cluster, - split by cell cycle phase. - PNG format + Composition plot colored by cluster. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group; all resolutions. + PNG format. - umap_spl_ph_res_plot_pdf: + cmp_gr_clst_spl_cnd_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_spl_ph_res_*.pdf" - doc: | - UMAP, colored by cluster, - split by cell cycle phase. - PDF format - - cmp_gr_ph_spl_idnt_plot_png: - type: File? - outputBinding: - glob: "*_cmp_gr_ph_spl_idnt.png" - doc: | - Composition plot, colored by - cell cycle phase, split by - dataset, downsampled. - PNG format - - cmp_gr_ph_spl_idnt_plot_pdf: - type: File? - outputBinding: - glob: "*_cmp_gr_ph_spl_idnt.pdf" + glob: "*_cmp_gr_clst_spl_cnd_res_*.pdf" doc: | - Composition plot, colored by - cell cycle phase, split by - dataset, downsampled. - PDF format + Composition plot colored by cluster. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group; all resolutions. + PDF format. - cmp_gr_ph_spl_clst_res_plot_png: + cmp_gr_cnd_spl_clst_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_cmp_gr_ph_spl_clst_res_*.png" + glob: "*_cmp_gr_cnd_spl_clst_res_*.png" doc: | - Composition plot, colored by - cell cycle phase, split by - cluster, downsampled. - PNG format + Composition plot colored by grouping condition. + Split by cluster; first downsampled to the + smallest dataset, then downsampled to the + smallest group; all resolutions. + PNG format. 
- cmp_gr_ph_spl_clst_res_plot_pdf: + cmp_gr_cnd_spl_clst_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_cmp_gr_ph_spl_clst_res_*.pdf" + glob: "*_cmp_gr_cnd_spl_clst_res_*.pdf" doc: | - Composition plot, colored by - cell cycle phase, split by - cluster, downsampled. - PDF format + Composition plot colored by grouping condition. + Split by cluster; first downsampled to the + smallest dataset, then downsampled to the + smallest group; all resolutions. + PDF format. - xpr_avg_res_plot_png: + xpr_per_cell_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_avg_res_*.png" + glob: "*_xpr_per_cell_*.png" doc: | - Gene expression dot plot. - PNG format + UMAP colored by gene expression. + All genes of interest. + PNG format. - xpr_avg_res_plot_pdf: + xpr_per_cell_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_avg_res_*.pdf" + glob: "*_xpr_per_cell_*.pdf" doc: | - Gene expression dot plot. - PDF format + UMAP colored by gene expression. + All genes of interest. + PDF format. - xpr_per_cell_plot_png: + xpr_per_cell_sgnl_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_per_cell_[!sgnl_]*.png" + glob: "*_xpr_per_cell_sgnl_*.png" doc: | - UMAP, gene expression. - PNG format + UMAP colored by gene expression density. + All genes of interest. + PNG format. - xpr_per_cell_plot_pdf: + xpr_per_cell_sgnl_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_per_cell_[!sgnl_]*.pdf" + glob: "*_xpr_per_cell_sgnl_*.pdf" doc: | - UMAP, gene expression. - PDF format + UMAP colored by gene expression density. + All genes of interest. + PDF format. - xpr_per_cell_sgnl_plot_png: + xpr_avg_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_per_cell_sgnl_*.png" + glob: "*_xpr_avg_res_*.png" doc: | - UMAP, gene expression density. - PNG format + Average gene expression. + All resolutions. + PNG format. 
- xpr_per_cell_sgnl_plot_pdf: + xpr_avg_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_per_cell_sgnl_*.pdf" + glob: "*_xpr_avg_res_*.pdf" doc: | - UMAP, gene expression density. - PDF format + Average gene expression. + All resolutions. + PDF format. xpr_dnst_res_plot_png: type: @@ -591,8 +692,9 @@ outputs: outputBinding: glob: "*_xpr_dnst_res_*.png" doc: | - Gene expression violin plot. - PNG format + Gene expression density. + All genes of interest; all resolutions. + PNG format. xpr_dnst_res_plot_pdf: type: @@ -602,8 +704,9 @@ outputs: outputBinding: glob: "*_xpr_dnst_res_*.pdf" doc: | - Gene expression violin plot. - PDF format + Gene expression density. + All genes of interest; all resolutions. + PDF format. xpr_htmp_res_plot_png: type: @@ -614,7 +717,8 @@ outputs: glob: "*_xpr_htmp_res_*.png" doc: | Gene expression heatmap. - PNG format + Top gene markers; all resolutions. + PNG format. xpr_htmp_res_plot_pdf: type: @@ -625,7 +729,8 @@ outputs: glob: "*_xpr_htmp_res_*.pdf" doc: | Gene expression heatmap. - PDF format + Top gene markers; all resolutions. + PDF format. xpr_htmp_res_tsv: type: @@ -635,73 +740,72 @@ outputs: outputBinding: glob: "*_xpr_htmp_res_*.tsv" doc: | - Gene markers used for gene - expression heatmap. - TSV format + Gene expression heatmap. + Top gene markers; all resolutions. + TSV format. gene_markers_tsv: type: File? outputBinding: glob: "*_gene_markers.tsv" doc: | - Gene markers per cluster for - all resolutions. - TSV format + Gene markers. + All resolutions. + TSV format. ucsc_cb_config_data: type: Directory? outputBinding: glob: "*_cellbrowser" doc: | - Directory with UCSC Cellbrowser - configuration data. + UCSC Cell Browser configuration data. ucsc_cb_html_data: type: Directory? outputBinding: glob: "*_cellbrowser/html_data" doc: | - Directory with UCSC Cellbrowser - html data. + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? 
outputBinding: glob: "*_cellbrowser/html_data/index.html" doc: | - HTML index file from the directory - with UCSC Cellbrowser html data. + UCSC Cell Browser html index. seurat_data_rds: type: File outputBinding: glob: "*_data.rds" doc: | - Reduced Seurat data in RDS format + Seurat object. + RDS format seurat_data_h5seurat: type: File? outputBinding: glob: "*_data.h5seurat" doc: | - Reduced Seurat data in - h5seurat format + Seurat object. + h5Seurat format seurat_data_h5ad: type: File? outputBinding: - glob: "*_data.h5ad" + glob: "*_counts.h5ad" doc: | - Reduced Seurat data in - h5ad format + Seurat object. + H5AD format seurat_data_scope: type: File? outputBinding: glob: "*_data.loom" doc: | - Reduced Seurat data in - SCope compatible loom format + Seurat object. + SCope compatible. + Loom format stdout_log: type: stdout @@ -723,8 +827,8 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell RNA-Seq Cluster Analysis" -s:name: "Single-cell RNA-Seq Cluster Analysis" +label: "Single-Cell RNA-Seq Cluster Analysis" +s:name: "Single-Cell RNA-Seq Cluster Analysis" s:alternateName: "Clusters single-cell RNA-Seq datasets, identifies gene markers" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-rna-cluster.cwl @@ -763,13 +867,13 @@ s:creator: doc: | - Single-cell RNA-Seq Cluster Analysis + Single-Cell RNA-Seq Cluster Analysis Clusters single-cell RNA-Seq datasets, identifies gene markers. 
s:about: | - usage: sc_rna_cluster.R [-h] --query QUERY + usage: /usr/local/bin/sc_rna_cluster.R [-h] --query QUERY [--dimensions DIMENSIONS] [--ametric {euclidean,cosine,manhattan,hamming}] [--algorithm {louvain,mult-louvain,slm,leiden}] @@ -783,8 +887,9 @@ s:about: | [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] + [--seed SEED] - Single-cell RNA-Seq Cluster Analysis + Single-Cell RNA-Seq Cluster Analysis optional arguments: -h, --help show this help message and exit @@ -795,8 +900,8 @@ s:about: | assay. --dimensions DIMENSIONS Dimensionality to use when constructing nearest- - neighbor graph before clustering (from 1 to 50). - Default: 10 + neighbor graph before clustering (from 1 to 50). Set + to 0 to use auto-estimated dimensionality. Default: 10 --ametric {euclidean,cosine,manhattan,hamming} Distance metric used when constructing nearest- neighbor graph before clustering. Default: euclidean @@ -836,7 +941,8 @@ s:about: | --pdf Export plots in PDF. Default: false --verbose Print debug information. Default: false --h5seurat Save Seurat data to h5seurat file. Default: false - --h5ad Save Seurat data to h5ad file. Default: false + --h5ad Save raw counts from the RNA assay to h5ad file. + Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --scope Save Seurat data to SCope compatible loom file. Default: false @@ -845,4 +951,5 @@ s:about: | Color theme for all generated plots. Default: classic --cpus CPUS Number of cores/cpus to use. Default: 1 --memory MEMORY Maximum memory in GB allowed to be shared between the - workers when using multiple --cpus. Default: 32 \ No newline at end of file + workers when using multiple --cpus. Default: 32 + --seed SEED Seed number for random values. 
Default: 42 \ No newline at end of file diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index e204cf2a..e4ce986a 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.33 + dockerPull: biowardrobe2/sc-tools:v0.0.34 inputs: @@ -163,7 +163,7 @@ inputs: inputBinding: prefix: "--h5ad" doc: | - Save Seurat data to h5ad file. + Save raw counts from the RNA assay to h5ad file. Default: false export_ucsc_cb: @@ -205,6 +205,14 @@ inputs: Number of cores/cpus to use. Default: 1 + seed: + type: int? + inputBinding: + prefix: "--seed" + doc: | + Seed number for random values. + Default: 42 + outputs: @@ -423,42 +431,45 @@ outputs: outputBinding: glob: "*_cellbrowser" doc: | - Directory with UCSC Cellbrowser configuration data. + UCSC Cell Browser configuration data. ucsc_cb_html_data: type: Directory? outputBinding: glob: "*_cellbrowser/html_data" doc: | - Directory with UCSC Cellbrowser html data. + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? outputBinding: glob: "*_cellbrowser/html_data/index.html" doc: | - HTML index file from the directory with UCSC Cellbrowser html data. + UCSC Cell Browser html index. seurat_data_rds: type: File outputBinding: glob: "*_data.rds" doc: | - Reduced Seurat data in RDS format + Seurat object. + RDS format seurat_data_h5seurat: type: File? outputBinding: glob: "*_data.h5seurat" doc: | - Reduced Seurat data in h5seurat format + Seurat object. + h5Seurat format seurat_data_h5ad: type: File? outputBinding: - glob: "*_data.h5ad" + glob: "*_counts.h5ad" doc: | - Reduced Seurat data in h5ad format + Seurat object. 
+ H5AD format stdout_log: type: stdout @@ -480,8 +491,8 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell Differential Abundance Analysis" -s:name: "Single-cell Differential Abundance Analysis" +label: "Single-Cell Differential Abundance Analysis" +s:name: "Single-Cell Differential Abundance Analysis" s:alternateName: "Detects cell subpopulations with differential abundance between datasets split by biological condition" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-rna-da-cells.cwl @@ -520,14 +531,14 @@ s:creator: doc: | - Single-cell Differential Abundance Analysis + Single-Cell Differential Abundance Analysis Detects cell subpopulations with differential abundance between datasets split by biological condition. s:about: | - usage: sc_rna_da_cells.R [-h] --query QUERY + usage: /usr/local/bin/sc_rna_da_cells.R [-h] --query QUERY [--reduction REDUCTION] [--dimensions DIMENSIONS] [--knn [KNN [KNN ...]]] @@ -539,8 +550,9 @@ s:about: | [--cbbuild] [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] + [--seed SEED] - Single-cell Differential Abundance Analysis + Single-Cell Differential Abundance Analysis optional arguments: -h, --help show this help message and exit @@ -590,11 +602,13 @@ s:about: | --pdf Export plots in PDF. Default: false --verbose Print debug information. Default: false --h5seurat Save Seurat data to h5seurat file. Default: false - --h5ad Save Seurat data to h5ad file. Default: false + --h5ad Save raw counts from the RNA assay to h5ad file. + Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --output OUTPUT Output prefix. Default: ./sc --theme {gray,bw,linedraw,light,dark,minimal,classic,void} Color theme for all generated plots. Default: classic --cpus CPUS Number of cores/cpus to use. 
Default: 1 --memory MEMORY Maximum memory in GB allowed to be shared between the - workers when using multiple --cpus. Default: 32 \ No newline at end of file + workers when using multiple --cpus. Default: 32 + --seed SEED Seed number for random values. Default: 42 \ No newline at end of file diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index 58a9d7ed..87dbba0f 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.33 + dockerPull: biowardrobe2/sc-tools:v0.0.34 inputs: @@ -322,6 +322,14 @@ inputs: Number of cores/cpus to use. Default: 1 + seed: + type: int? + inputBinding: + prefix: "--seed" + doc: | + Seed number for random values. + Default: 42 + outputs: @@ -631,8 +639,8 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell Differential Expression Analysis" -s:name: "Single-cell Differential Expression Analysis" +label: "Single-Cell RNA-Seq Differential Expression Analysis" +s:name: "Single-Cell RNA-Seq Differential Expression Analysis" s:alternateName: "Identifies differentially expressed genes between two groups of cells optionally coerced to pseudobulk form" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-rna-de-pseudobulk.cwl @@ -671,29 +679,36 @@ s:creator: doc: | - Single-cell Differential Expression Analysis + Single-Cell RNA-Seq Differential Expression Analysis Identifies differentially expressed genes between two groups of cells optionally coerced to pseudobulk form s:about: | - usage: sc_rna_de_pseudobulk.R - [-h] --query QUERY [--metadata METADATA] [--barcodes BARCODES] - [--groupby GROUPBY] [--subset [SUBSET ...]] --splitby SPLITBY --first - FIRST --second SECOND - [--test {wilcoxon,likelihood-ratio,t-test,negative-binomial,poisson,logistic-regression,mast,deseq,deseq-lrt}] - 
[--batchby BATCHBY] [--padj PADJ] [--genes [GENES ...]] - [--exclude EXCLUDE] [--cluster {row,column,both}] - [--rowdist {cosangle,abscosangle,euclid,abseuclid,cor,abscor}] - [--columndist {cosangle,abscosangle,euclid,abseuclid,cor,abscor}] - [--center] [--pdf] [--verbose] [--output OUTPUT] - [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] - [--cpus CPUS] [--memory MEMORY] - - Single-cell Differential Expression Analysis - - options: + usage: /usr/local/bin/sc_rna_de_pseudobulk.R [-h] --query QUERY + [--metadata METADATA] + [--barcodes BARCODES] + [--groupby GROUPBY] + [--subset [SUBSET [SUBSET ...]]] + --splitby SPLITBY --first FIRST + --second SECOND + [--test {wilcoxon,likelihood-ratio,t-test,negative-binomial,poisson,logistic-regression,mast,deseq,deseq-lrt}] + [--batchby BATCHBY] [--padj PADJ] + [--genes [GENES [GENES ...]]] + [--exclude EXCLUDE] + [--cluster {row,column,both}] + [--rowdist {cosangle,abscosangle,euclid,abseuclid,cor,abscor}] + [--columndist {cosangle,abscosangle,euclid,abseuclid,cor,abscor}] + [--center] [--pdf] [--verbose] + [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + [--seed SEED] + + Single-Cell RNA-Seq Differential Expression Analysis + + optional arguments: -h, --help show this help message and exit --query QUERY Path to the RDS file to load Seurat object from. This file should include genes expression information @@ -723,7 +738,7 @@ s:about: | added with --metadata or --barcodes parameters. Ignored if --subset is not set. Default: do not subset, include all cells into analysis. - --subset [SUBSET ...] + --subset [SUBSET [SUBSET ...]] Values from the column set with --groupby parameter to subset cells before running differential expression analysis. Ignored if --groupby is not provided. @@ -768,7 +783,8 @@ s:about: | output only differentially expressed genes with adjusted P-value not bigger than this value. Default: 0.05 - --genes [GENES ...] 
Genes of interest to label on the generated plots. + --genes [GENES [GENES ...]] + Genes of interest to label on the generated plots. Default: top 10 genes with the highest and the lowest log2FoldChange values. --exclude EXCLUDE Regex pattern to identify and exclude specific genes @@ -800,4 +816,5 @@ s:about: | Color theme for all generated plots. Default: classic --cpus CPUS Number of cores/cpus to use. Default: 1 --memory MEMORY Maximum memory in GB allowed to be shared between the - workers when using multiple --cpus. Default: 32 \ No newline at end of file + workers when using multiple --cpus. Default: 32 + --seed SEED Seed number for random values. Default: 42 \ No newline at end of file diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index e7c68712..40ea5934 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.33 + dockerPull: biowardrobe2/sc-tools:v0.0.34 inputs: @@ -79,7 +79,8 @@ inputs: doc: | Include cells where at least this many genes are detected. If multiple values provided, each of them will be applied to the correspondent dataset from the - '--mex' input based on the '--identity' file. + '--mex' input based on the '--identity' file. Any 0 will be replaced with the + auto-estimated threshold (median - 2.5 * MAD) calculated per dataset. Default: 250 (applied to all datasets) maximum_genes: @@ -92,7 +93,8 @@ inputs: doc: | Include cells with the number of genes not bigger than this value. If multiple values provided, each of them will be applied to the correspondent dataset from - the '--mex' input based on the '--identity' file. + the '--mex' input based on the '--identity' file. Any 0 will be replaced with the + auto-estimated threshold (median + 5 * MAD) calculated per dataset. 
Default: 5000 (applied to all datasets) minimum_umis: @@ -105,7 +107,9 @@ inputs: doc: | Include cells where at least this many RNA reads are detected. If multiple values provided, each of them will be applied to the correspondent - dataset from the '--mex' input based on the '--identity' file. + dataset from the '--mex' input based on the '--identity' file. Any 0 will be + replaced with the auto-estimated threshold (median - 2.5 * MAD) calculated + per dataset. Default: 500 (applied to all datasets) minimum_novelty_score: @@ -136,7 +140,9 @@ inputs: prefix: "--maxmt" doc: | Include cells with the percentage of RNA reads mapped to mitochondrial - genes not bigger than this value. + genes not bigger than this value. Set to 0 for using an auto-estimated + threshold equal to the maximum among (median + 2 * MAD) values + calculated per dataset. Default: 5 (applied to all datasets) remove_doublets: @@ -216,7 +222,7 @@ inputs: inputBinding: prefix: "--h5ad" doc: | - Save Seurat data to h5ad file. + Save raw counts from the RNA assay to h5ad file. Default: false export_ucsc_cb: @@ -258,6 +264,14 @@ inputs: Number of cores/cpus to use. Default: 1 + seed: + type: int? + inputBinding: + prefix: "--seed" + doc: | + Seed number for random values. + Default: 42 + outputs: @@ -357,6 +371,22 @@ outputs: Genes vs RNA reads per cell correlation (not filtered). PDF format + raw_umi_mito_plot_png: + type: File? + outputBinding: + glob: "*_raw_umi_mito.png" + doc: | + RNA reads vs mitochondrial % per cell (not filtered). + PNG format + + raw_umi_mito_plot_pdf: + type: File? + outputBinding: + glob: "*_raw_umi_mito.pdf" + doc: | + RNA reads vs mitochondrial % per cell (not filtered). + PDF format + raw_mito_dnst_plot_png: type: File? outputBinding: @@ -583,6 +613,22 @@ outputs: Genes vs RNA reads per cell correlation (filtered). PDF format + fltr_umi_mito_plot_png: + type: File? + outputBinding: + glob: "*_fltr_umi_mito.png" + doc: | + RNA reads vs mitochondrial % per cell (filtered). 
+ PNG format + + fltr_umi_mito_plot_pdf: + type: File? + outputBinding: + glob: "*_fltr_umi_mito.pdf" + doc: | + RNA reads vs mitochondrial % per cell (filtered). + PDF format + fltr_mito_dnst_plot_png: type: File? outputBinding: @@ -718,28 +764,29 @@ outputs: outputBinding: glob: "*_cellbrowser" doc: | - Directory with UCSC Cellbrowser configuration data. + UCSC Cell Browser configuration data. ucsc_cb_html_data: type: Directory? outputBinding: glob: "*_cellbrowser/html_data" doc: | - Directory with UCSC Cellbrowser html data. + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? outputBinding: glob: "*_cellbrowser/html_data/index.html" doc: | - HTML index file from the directory with UCSC Cellbrowser html data. + UCSC Cell Browser html index. seurat_data_rds: type: File outputBinding: glob: "*_data.rds" doc: | - Filtered Seurat data in RDS format + Seurat object. + RDS format datasets_metadata: type: File @@ -754,14 +801,16 @@ outputs: outputBinding: glob: "*_data.h5seurat" doc: | - Filtered Seurat data in h5seurat format + Seurat object. + h5Seurat format seurat_data_h5ad: type: File? outputBinding: - glob: "*_data.h5ad" + glob: "*_counts.h5ad" doc: | - Reduced Seurat data in h5ad format + Seurat object. + H5AD format stdout_log: type: stdout @@ -794,8 +843,8 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell RNA-Seq Filtering Analysis" -s:name: "Single-cell RNA-Seq Filtering Analysis" +label: "Single-Cell RNA-Seq Filtering Analysis" +s:name: "Single-Cell RNA-Seq Filtering Analysis" s:alternateName: "Filters single-cell RNA-Seq datasets based on the common QC metrics" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-rna-filter.cwl @@ -834,27 +883,30 @@ s:creator: doc: | - Single-cell RNA-Seq Filtering Analysis + Single-Cell RNA-Seq Filtering Analysis Filters single-cell RNA-Seq datasets based on the common QC metrics. 
s:about: | - usage: sc_rna_filter.R [-h] --mex MEX [MEX ...] --identity IDENTITY - [--grouping GROUPING] [--barcodes BARCODES] - [--rnamincells RNAMINCELLS] - [--mingenes [MINGENES [MINGENES ...]]] - [--maxgenes [MAXGENES [MAXGENES ...]]] - [--minumis [MINUMIS [MINUMIS ...]]] - [--minnovelty [MINNOVELTY [MINNOVELTY ...]]] - [--mitopattern MITOPATTERN] [--maxmt MAXMT] - [--removedoublets] [--rnadbr RNADBR] - [--rnadbrsd RNADBRSD] [--pdf] [--verbose] [--h5seurat] - [--h5ad] [--cbbuild] [--output OUTPUT] - [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] - [--cpus CPUS] [--memory MEMORY] - - Single-cell RNA-Seq Filtering Analysis + usage: /usr/local/bin/sc_rna_filter.R [-h] --mex MEX [MEX ...] --identity + IDENTITY [--grouping GROUPING] + [--barcodes BARCODES] + [--rnamincells RNAMINCELLS] + [--mingenes [MINGENES [MINGENES ...]]] + [--maxgenes [MAXGENES [MAXGENES ...]]] + [--minumis [MINUMIS [MINUMIS ...]]] + [--minnovelty [MINNOVELTY [MINNOVELTY ...]]] + [--mitopattern MITOPATTERN] + [--maxmt MAXMT] [--removedoublets] + [--rnadbr RNADBR] [--rnadbrsd RNADBRSD] + [--pdf] [--verbose] [--h5seurat] + [--h5ad] [--cbbuild] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + [--seed SEED] + + Single-Cell RNA-Seq Filtering Analysis optional arguments: -h, --help show this help message and exit @@ -895,21 +947,26 @@ s:about: | Include cells where at least this many genes are detected. If multiple values provided, each of them will be applied to the correspondent dataset from the - '--mex' input based on the '--identity' file. Default: + '--mex' input based on the '--identity' file. Any 0 + will be replaced with the auto-estimated threshold + (median - 2.5 * MAD) calculated per dataset. Default: 250 (applied to all datasets) --maxgenes [MAXGENES [MAXGENES ...]] Include cells with the number of genes not bigger than this value. 
If multiple values provided, each of them will be applied to the correspondent dataset from the - '--mex' input based on the '--identity' file. Default: + '--mex' input based on the '--identity' file. Any 0 + will be replaced with the auto-estimated threshold + (median + 5 * MAD) calculated per dataset. Default: 5000 (applied to all datasets) --minumis [MINUMIS [MINUMIS ...]] - Include cells where at least this many RNA reads - are detected. If multiple values - provided, each of them will be applied to the - correspondent dataset from the '--mex' input based on - the '--identity' file. Default: 500 (applied to all - datasets) + Include cells where at least this many RNA reads are + detected. If multiple values provided, each of them + will be applied to the correspondent dataset from the + '--mex' input based on the '--identity' file. Any 0 + will be replaced with the auto-estimated threshold + (median - 2.5 * MAD) calculated per dataset. Default: + 500 (applied to all datasets) --minnovelty [MINNOVELTY [MINNOVELTY ...]] Include cells with the novelty score not lower than this value, calculated for as log10(genes)/log10(UMI). @@ -920,9 +977,11 @@ s:about: | --mitopattern MITOPATTERN Regex pattern to identify mitochondrial genes. Default: '^mt-|^MT-' - --maxmt MAXMT Include cells with the percentage of RNA reads - mapped to mitochondrial genes not bigger than this - value. Default: 5 (applied to all datasets) + --maxmt MAXMT Include cells with the percentage of RNA reads mapped + to mitochondrial genes not bigger than this value. Set + to 0 for using an auto-estimated threshold equal to + the maximum among (median + 2 * MAD) values calculated + per dataset. Default: 5 (applied to all datasets) --removedoublets Remove cells that were identified as doublets. Cells with RNA UMI < 200 will not be evaluated. Default: do not remove doublets @@ -936,11 +995,13 @@ s:about: | --pdf Export plots in PDF. Default: false --verbose Print debug information. 
Default: false --h5seurat Save Seurat data to h5seurat file. Default: false - --h5ad Save Seurat data to h5ad file. Default: false + --h5ad Save raw counts from the RNA assay to h5ad file. + Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --output OUTPUT Output prefix. Default: ./sc --theme {gray,bw,linedraw,light,dark,minimal,classic,void} Color theme for all generated plots. Default: classic --cpus CPUS Number of cores/cpus to use. Default: 1 --memory MEMORY Maximum memory in GB allowed to be shared between the - workers when using multiple '--cpus'. Default: 32 \ No newline at end of file + workers when using multiple '--cpus'. Default: 32 + --seed SEED Seed number for random values. Default: 42 \ No newline at end of file diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index 6c5620f7..32e9f329 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.33 + dockerPull: biowardrobe2/sc-tools:v0.0.34 inputs: @@ -98,7 +98,7 @@ inputs: Normalization method applied to genes expression counts. If loaded Seurat object includes multiple datasets, normalization will be run independently for each of them, unless integration is disabled with 'none' or set to 'harmony' - Default: sct + Default: sctglm integration_method: type: @@ -183,7 +183,8 @@ inputs: doc: | Dimensionality to use for datasets integration (if provided RDS file includes multiple datasets and --ntgr is not set to 'harmony') - and UMAP projection (from 1 to 50). + and UMAP projection (from 1 to 50). Set to 0 to use auto-estimated + dimensionality. Default: 10 umap_spread: @@ -313,7 +314,7 @@ inputs: inputBinding: prefix: "--h5ad" doc: | - Save Seurat data to h5ad file. + Save raw counts from the RNA assay to h5ad file. Default: false export_scope_data: @@ -374,6 +375,14 @@ inputs: Number of cores/cpus to use. Default: 1 + seed: + type: int? 
+ inputBinding: + prefix: "--seed" + doc: | + Seed number for random values. + Default: 42 + outputs: @@ -676,49 +685,54 @@ outputs: outputBinding: glob: "*_cellbrowser" doc: | - Directory with UCSC Cellbrowser configuration data. + UCSC Cell Browser configuration data. ucsc_cb_html_data: type: Directory? outputBinding: glob: "*_cellbrowser/html_data" doc: | - Directory with UCSC Cellbrowser html data. + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? outputBinding: glob: "*_cellbrowser/html_data/index.html" doc: | - HTML index file from the directory with UCSC Cellbrowser html data. + UCSC Cell Browser html index. seurat_data_rds: type: File outputBinding: glob: "*_data.rds" doc: | - Reduced Seurat data in RDS format + Seurat object. + RDS format seurat_data_h5seurat: type: File? outputBinding: glob: "*_data.h5seurat" doc: | - Reduced Seurat data in h5seurat format + Seurat object. + h5Seurat format seurat_data_h5ad: type: File? outputBinding: - glob: "*_data.h5ad" + glob: "*_counts.h5ad" doc: | - Reduced Seurat data in h5ad format + Seurat object. + H5AD format seurat_data_scope: type: File? outputBinding: glob: "*_data.loom" doc: | - Reduced Seurat data in SCope compatible loom format + Seurat object. + SCope compatible. 
+ Loom format stdout_log: type: stdout @@ -740,8 +754,8 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell RNA-Seq Dimensionality Reduction Analysis" -s:name: "Single-cell RNA-Seq Dimensionality Reduction Analysis" +label: "Single-Cell RNA-Seq Dimensionality Reduction Analysis" +s:name: "Single-Cell RNA-Seq Dimensionality Reduction Analysis" s:alternateName: "Integrates multiple single-cell RNA-Seq datasets, reduces dimensionality using PCA" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-rna-reduce.cwl @@ -780,13 +794,13 @@ s:creator: doc: | - Single-cell RNA-Seq Dimensionality Reduction Analysis + Single-Cell RNA-Seq Dimensionality Reduction Analysis Integrates multiple single-cell RNA-Seq datasets, reduces dimensionality using PCA. s:about: | - usage: sc_rna_reduce.R [-h] --query QUERY [--metadata METADATA] + usage: /usr/local/bin/sc_rna_reduce.R [-h] --query QUERY [--metadata METADATA] [--barcodes BARCODES] [--cellcycle CELLCYCLE] [--norm {sct,log,sctglm}] @@ -807,8 +821,9 @@ s:about: | [--lowmem] [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] + [--seed SEED] - Single-cell RNA-Seq Dimensionality Reduction Analysis + Single-Cell RNA-Seq Dimensionality Reduction Analysis optional arguments: -h, --help show this help message and exit @@ -881,7 +896,8 @@ s:about: | Dimensionality to use for datasets integration (if provided RDS file includes multiple datasets and --ntgr is not set to 'harmony') and UMAP projection - (from 1 to 50). Default: 10 + (from 1 to 50). Set to 0 to use auto-estimated + dimensionality. Default: 10 --uspread USPREAD The effective scale of embedded points on UMAP. In combination with '--mindist' it determines how clustered/clumped the embedded points are. Default: 1 @@ -906,7 +922,8 @@ s:about: | --pdf Export plots in PDF. 
Default: false --verbose Print debug information. Default: false --h5seurat Save Seurat data to h5seurat file. Default: false - --h5ad Save Seurat data to h5ad file. Default: false + --h5ad Save raw counts from the RNA assay to h5ad file. + Default: false --scope Save Seurat data to SCope compatible loom file. Default: false --cbbuild Export results to UCSC Cell Browser. Default: false @@ -920,4 +937,5 @@ s:about: | Color theme for all generated plots. Default: classic --cpus CPUS Number of cores/cpus to use. Default: 1 --memory MEMORY Maximum memory in GB allowed to be shared between the - workers when using multiple --cpus. Default: 32 \ No newline at end of file + workers when using multiple --cpus. Default: 32 + --seed SEED Seed number for random values. Default: 42 \ No newline at end of file diff --git a/tools/sc-rna-trajectory.cwl b/tools/sc-rna-trajectory.cwl index b3619ab7..c985a914 100644 --- a/tools/sc-rna-trajectory.cwl +++ b/tools/sc-rna-trajectory.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.33 + dockerPull: biowardrobe2/sc-tools:v0.0.34 inputs: @@ -137,7 +137,7 @@ inputs: inputBinding: prefix: "--h5ad" doc: | - Save Seurat data to h5ad file. + Save raw counts from the RNA assay to h5ad file. Default: false export_ucsc_cb: @@ -179,6 +179,14 @@ inputs: Number of cores/cpus to use. Default: 1 + seed: + type: int? + inputBinding: + prefix: "--seed" + doc: | + Seed number for random values. + Default: 42 + outputs: @@ -559,45 +567,45 @@ outputs: outputBinding: glob: "*_cellbrowser" doc: | - Directory with UCSC Cellbrowser - configuration data. + UCSC Cell Browser configuration data. ucsc_cb_html_data: type: Directory? outputBinding: glob: "*_cellbrowser/html_data" doc: | - Directory with UCSC Cellbrowser - html data. + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? 
outputBinding: glob: "*_cellbrowser/html_data/index.html" doc: | - HTML index file from the directory - with UCSC Cellbrowser html data. + UCSC Cell Browser html index. seurat_data_rds: type: File outputBinding: glob: "*_data.rds" doc: | - Reduced Seurat data in RDS format + Seurat object. + RDS format seurat_data_h5seurat: type: File? outputBinding: glob: "*_data.h5seurat" doc: | - Reduced Seurat data in h5seurat format + Seurat object. + h5Seurat format seurat_data_h5ad: type: File? outputBinding: - glob: "*_data.h5ad" + glob: "*_counts.h5ad" doc: | - Reduced Seurat data in h5ad format + Seurat object. + H5AD format stdout_log: type: stdout @@ -619,8 +627,8 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell RNA-Seq Trajectory Analysis" -s:name: "Single-cell RNA-Seq Trajectory Analysis" +label: "Single-Cell RNA-Seq Trajectory Analysis" +s:name: "Single-Cell RNA-Seq Trajectory Analysis" s:alternateName: "Aligns cells along the trajectory defined based on PCA or other dimensionality reduction" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-rna-trajectory.cwl @@ -659,14 +667,14 @@ s:creator: doc: | - Single-cell RNA-Seq Trajectory Analysis + Single-Cell RNA-Seq Trajectory Analysis Aligns cells along the trajectory defined based on PCA or other dimensionality reduction s:about: | - usage: sc_rna_trajectory.R [-h] --query QUERY + usage: /usr/local/bin/sc_rna_trajectory.R [-h] --query QUERY [--reduction REDUCTION] [--dimensions DIMENSIONS] --source SOURCE [--barcodes BARCODES] @@ -677,8 +685,9 @@ s:about: | [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] + [--seed SEED] - Single-cell RNA-Seq Trajectory Analysis + Single-Cell RNA-Seq Trajectory Analysis optional arguments: -h, --help show this help message and exit @@ -712,11 +721,13 @@ s:about: | --pdf Export plots in PDF. 
Default: false --verbose Print debug information. Default: false --h5seurat Save Seurat data to h5seurat file. Default: false - --h5ad Save Seurat data to h5ad file. Default: false + --h5ad Save raw counts from the RNA assay to h5ad file. + Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --output OUTPUT Output prefix. Default: ./sc --theme {gray,bw,linedraw,light,dark,minimal,classic,void} Color theme for all generated plots. Default: classic --cpus CPUS Number of cores/cpus to use. Default: 1 --memory MEMORY Maximum memory in GB allowed to be shared between the - workers when using multiple --cpus. Default: 32 \ No newline at end of file + workers when using multiple --cpus. Default: 32 + --seed SEED Seed number for random values. Default: 42 \ No newline at end of file diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index eda197d0..43a57651 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.33 + dockerPull: biowardrobe2/sc-tools:v0.0.34 inputs: @@ -103,7 +103,7 @@ inputs: inputBinding: prefix: "--h5ad" doc: | - Save Seurat data to h5ad file. + Save raw counts from the RNA and/or ATAC assay(s) to h5ad file(s). Default: false export_ucsc_cb: @@ -145,6 +145,14 @@ inputs: Number of cores/cpus to use. Default: 1 + seed: + type: int? + inputBinding: + prefix: "--seed" + doc: | + Seed number for random values. + Default: 42 + outputs: @@ -297,42 +305,55 @@ outputs: outputBinding: glob: "*_cellbrowser" doc: | - Directory with UCSC Cellbrowser configuration data. + UCSC Cell Browser configuration data. ucsc_cb_html_data: type: Directory? outputBinding: glob: "*_cellbrowser/html_data" doc: | - Directory with UCSC Cellbrowser html data. + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? 
outputBinding: glob: "*_cellbrowser/html_data/index.html" doc: | - HTML index file from the directory with UCSC Cellbrowser html data. + UCSC Cell Browser html index. seurat_data_rds: type: File outputBinding: glob: "*_data.rds" doc: | - Seurat data in RDS format + Seurat object. + RDS format seurat_data_h5seurat: type: File? outputBinding: glob: "*_data.h5seurat" doc: | - Seurat data in h5seurat format + Seurat object. + h5Seurat format + + seurat_rna_data_h5ad: + type: File? + outputBinding: + glob: "*_rna_counts.h5ad" + doc: | + Seurat object. + RNA counts. + H5AD format. - seurat_data_h5ad: + seurat_atac_data_h5ad: type: File? outputBinding: - glob: "*_data.h5ad" + glob: "*_atac_counts.h5ad" doc: | - Seurat data in h5ad format + Seurat object. + ATAC counts. + H5AD format. stdout_log: type: stdout @@ -354,8 +375,8 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell Label Integration Analysis" -s:name: "Single-cell Label Integration Analysis" +label: "Single-Cell Label Integration Analysis" +s:name: "Single-Cell Label Integration Analysis" s:alternateName: "Harmonizes conflicting annotations in single-cell genomics studies" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-triangulate.cwl @@ -394,20 +415,22 @@ s:creator: doc: | - Single-cell Label Integration Analysis + Single-Cell Label Integration Analysis Harmonizes conflicting annotations in single-cell genomics studies. s:about: | - usage: sc_triangulate.R - [-h] --query QUERY [--barcodes BARCODES] --source SOURCE [SOURCE ...] - [--target TARGET] [--pdf] [--verbose] [--h5seurat] [--h5ad] [--cbbuild] - [--output OUTPUT] - [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] - [--cpus CPUS] [--memory MEMORY] + usage: /usr/local/bin/sc_triangulate.R [-h] --query QUERY + [--barcodes BARCODES] --source SOURCE + [SOURCE ...] 
[--target TARGET] [--pdf] + [--verbose] [--h5seurat] [--h5ad] + [--cbbuild] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + [--seed SEED] - Single-cell Label Integration Analysis + Single-Cell Label Integration Analysis optional arguments: -h, --help show this help message and exit @@ -432,11 +455,13 @@ s:about: | --pdf Export plots in PDF. Default: false --verbose Print debug information. Default: false --h5seurat Save Seurat data to h5seurat file. Default: false - --h5ad Save Seurat data to h5ad file. Default: false + --h5ad Save raw counts from the RNA and/or ATAC assay(s) to + h5ad file(s). Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --output OUTPUT Output prefix. Default: ./sc --theme {gray,bw,linedraw,light,dark,minimal,classic,void} Color theme for all generated plots. Default: classic --cpus CPUS Number of cores/cpus to use. Default: 1 --memory MEMORY Maximum memory in GB allowed to be shared between the - workers when using multiple --cpus. Default: 32 \ No newline at end of file + workers when using multiple --cpus. Default: 32 + --seed SEED Seed number for random values. Default: 42 \ No newline at end of file diff --git a/tools/sc-vdj-profile.cwl b/tools/sc-vdj-profile.cwl index f4e73ca1..a4ababfa 100644 --- a/tools/sc-vdj-profile.cwl +++ b/tools/sc-vdj-profile.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.33 + dockerPull: biowardrobe2/sc-tools:v0.0.34 inputs: @@ -162,7 +162,7 @@ inputs: inputBinding: prefix: "--h5ad" doc: | - Save Seurat data to h5ad file. + Save raw counts from the RNA assay to h5ad file. Default: false export_scope_data: @@ -216,6 +216,14 @@ inputs: Number of cores/cpus to use. Default: 1 + seed: + type: int? + inputBinding: + prefix: "--seed" + doc: | + Seed number for random values. 
+ Default: 42 + outputs: @@ -622,53 +630,54 @@ outputs: outputBinding: glob: "*_cellbrowser" doc: | - Directory with UCSC Cellbrowser - configuration data. + UCSC Cell Browser configuration data. ucsc_cb_html_data: type: Directory? outputBinding: glob: "*_cellbrowser/html_data" doc: | - Directory with UCSC Cellbrowser - html data. + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? outputBinding: glob: "*_cellbrowser/html_data/index.html" doc: | - HTML index file from the directory - with UCSC Cellbrowser html data. + UCSC Cell Browser html index. seurat_data_rds: type: File outputBinding: glob: "*_data.rds" doc: | - Reduced Seurat data in RDS format + Seurat object. + RDS format seurat_data_h5seurat: type: File? outputBinding: glob: "*_data.h5seurat" doc: | - Reduced Seurat data in h5seurat format + Seurat object. + h5Seurat format seurat_data_h5ad: type: File? outputBinding: - glob: "*_data.h5ad" + glob: "*_counts.h5ad" doc: | - Reduced Seurat data in h5ad format + Seurat object. + H5AD format seurat_data_scope: type: File? outputBinding: glob: "*_data.loom" doc: | - Reduced Seurat data in SCope - compatible loom format + Seurat object. + SCope compatible. 
+ Loom format stdout_log: type: stdout @@ -690,8 +699,8 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell Immune Profiling Analysis" -s:name: "Single-cell Immune Profiling Analysis" +label: "Single-Cell Immune Profiling Analysis" +s:name: "Single-Cell Immune Profiling Analysis" s:alternateName: "TCR/BCR clonotype dynamics analysis" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-vdj-profile.cwl @@ -730,23 +739,26 @@ s:creator: doc: | - Single-cell Immune Profiling Analysis + Single-Cell Immune Profiling Analysis TCR/BCR clonotype dynamics analysis s:about: | - usage: sc_vdj_profile.R [-h] --query QUERY --contigs CONTIGS - [--metadata METADATA] [--barcodes BARCODES] - --source SOURCE - [--cloneby {gene,nt,aa,strict}] [--groupby GROUPBY] - [--strictness {removemulti,filtermulti}] [--pdf] - [--verbose] [--h5seurat] [--h5ad] [--cbbuild] - [--scope] [--output OUTPUT] - [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] - [--cpus CPUS] [--memory MEMORY] - - Single-cell Immune Profiling Analysis + usage: /usr/local/bin/sc_vdj_profile.R [-h] --query QUERY --contigs CONTIGS + [--metadata METADATA] + [--barcodes BARCODES] --source SOURCE + [--cloneby {gene,nt,aa,strict}] + [--groupby GROUPBY] + [--strictness {removemulti,filtermulti}] + [--pdf] [--verbose] [--h5seurat] + [--h5ad] [--cbbuild] [--scope] + [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + [--seed SEED] + + Single-Cell Immune Profiling Analysis optional arguments: -h, --help show this help message and exit @@ -794,7 +806,8 @@ s:about: | --pdf Export plots in PDF. Default: false --verbose Print debug information. Default: false --h5seurat Save Seurat data to h5seurat file. Default: false - --h5ad Save Seurat data to h5ad file. Default: false + --h5ad Save raw counts from the RNA assay to h5ad file. 
+ Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --scope Save Seurat data to SCope compatible loom file. Default: false @@ -803,4 +816,5 @@ s:about: | Color theme for all generated plots. Default: classic --cpus CPUS Number of cores/cpus to use. Default: 1 --memory MEMORY Maximum memory in GB allowed to be shared between the - workers when using multiple --cpus. Default: 32 \ No newline at end of file + workers when using multiple --cpus. Default: 32 + --seed SEED Seed number for random values. Default: 42 \ No newline at end of file diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index 5d1723fa..c63f975b 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.33 + dockerPull: biowardrobe2/sc-tools:v0.0.34 inputs: @@ -23,8 +23,8 @@ inputs: doc: | Path to the RDS file to load Seurat object from. This file should include genes expression and chromatin accessibility information stored in the RNA - and ATAC assays correspondingly. Additionally, 'pca', 'rnaumap', 'atac_lsi' - and 'atacumap' dimensionality reductions should be present. + and ATAC assays correspondingly. Additionally, 'pca' and 'atac_lsi' + dimensionality reductions should be present. rna_dimensions: type: int? @@ -353,7 +353,7 @@ inputs: inputBinding: prefix: "--h5ad" doc: | - Save Seurat data to h5ad file. + Save raw counts from the RNA and ATAC assays to h5ad files. Default: false export_scope_data: @@ -404,54 +404,150 @@ inputs: Number of cores/cpus to use. Default: 1 + seed: + type: int? + inputBinding: + prefix: "--seed" + doc: | + Seed number for random values. + Default: 42 + outputs: - umap_res_plot_png: + umap_gr_ph_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_ph_spl_idnt.png" + doc: | + UMAP colored by cell cycle phase. + Split by dataset; downsampled to the + smallest dataset. + PNG format. 
+ + umap_gr_ph_spl_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_gr_ph_spl_idnt.pdf" + doc: | + UMAP colored by cell cycle phase. + Split by dataset; downsampled to the + smallest dataset. + PDF format. + + cmp_gr_ph_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_idnt.png" + doc: | + Composition plot colored by cell cycle phase. + Split by dataset; downsampled to the smallest + dataset. + PNG format + + cmp_gr_ph_spl_idnt_plot_pdf: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_idnt.pdf" + doc: | + Composition plot colored by cell cycle phase. + Split by dataset; downsampled to the smallest + dataset. + PDF format + + umap_gr_ph_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_ph_spl_cnd.png" + doc: | + UMAP colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + + umap_gr_ph_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_umap_gr_ph_spl_cnd.pdf" + doc: | + UMAP colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PDF format. + + cmp_gr_ph_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_cnd.png" + doc: | + Composition plot colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + + cmp_gr_ph_spl_cnd_plot_pdf: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_cnd.pdf" + doc: | + Composition plot colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PDF format. + + umap_gr_clst_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_res_*.png" + glob: "*_umap_gr_clst_res_*.png" doc: | - UMAP, colored by cluster. 
+ UMAP colored by cluster. + All cells; all resolutions. PNG format - umap_res_plot_pdf: + umap_gr_clst_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_res_*.pdf" + glob: "*_umap_gr_clst_res_*.pdf" doc: | - UMAP, colored by cluster. + UMAP colored by cluster. + All cells; all resolutions. PDF format - umap_spl_idnt_res_plot_png: + umap_gr_clst_spl_idnt_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_spl_idnt_res_*.png" + glob: "*_umap_gr_clst_spl_idnt_res_*.png" doc: | - UMAP, colored by cluster, - split by dataset. - PNG format + UMAP colored by cluster. + Split by dataset; downsampled to the + smallest dataset; all resolutions. + PNG format. - umap_spl_idnt_res_plot_pdf: + umap_gr_clst_spl_idnt_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_spl_idnt_res_*.pdf" + glob: "*_umap_gr_clst_spl_idnt_res_*.pdf" doc: | - UMAP, colored by cluster, - split by dataset. - PDF format + UMAP colored by cluster. + Split by dataset; downsampled to the + smallest dataset; all resolutions. + PDF format. cmp_gr_clst_spl_idnt_res_plot_png: type: @@ -461,10 +557,10 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_idnt_res_*.png" doc: | - Composition plot, colored by - cluster, split by dataset, - downsampled. - PNG format + Composition plot colored by cluster. + Split by dataset; downsampled to the + smallest dataset; all resolutions. + PNG format. cmp_gr_clst_spl_idnt_res_plot_pdf: type: @@ -474,10 +570,10 @@ outputs: outputBinding: glob: "*_cmp_gr_clst_spl_idnt_res_*.pdf" doc: | - Composition plot, colored by - cluster, split by dataset, - downsampled. - PDF format + Composition plot colored by cluster. + Split by dataset; downsampled to the + smallest dataset; all resolutions. + PDF format. 
cmp_gr_idnt_spl_clst_res_plot_png: type: @@ -487,10 +583,10 @@ outputs: outputBinding: glob: "*_cmp_gr_idnt_spl_clst_res_*.png" doc: | - Composition plot, colored by - dataset, split by cluster, - downsampled. - PNG format + Composition plot colored by dataset. + Split by cluster; downsampled to the + smallest dataset; all resolutions. + PNG format. cmp_gr_idnt_spl_clst_res_plot_pdf: type: @@ -500,222 +596,226 @@ outputs: outputBinding: glob: "*_cmp_gr_idnt_spl_clst_res_*.pdf" doc: | - Composition plot, colored by - dataset, split by cluster, - downsampled. - PDF format + Composition plot colored by dataset. + Split by cluster; downsampled to the + smallest dataset; all resolutions. + PDF format. - umap_spl_cnd_res_plot_png: + umap_gr_clst_spl_ph_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_spl_cnd_res_*.png" + glob: "*_umap_gr_clst_spl_ph_res_*.png" doc: | - UMAP, colored by cluster, - split by grouping condition. - PNG format + UMAP colored by cluster. + Split by cell cycle phase; downsampled + to the smallest dataset (if multiple + datasets are analyzed jointly); all + resolutions. + PNG format. - umap_spl_cnd_res_plot_pdf: + umap_gr_clst_spl_ph_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_spl_cnd_res_*.pdf" + glob: "*_umap_gr_clst_spl_ph_res_*.pdf" doc: | - UMAP, colored by cluster, - split by grouping condition. - PDF format + UMAP colored by cluster. + Split by cell cycle phase; downsampled + to the smallest dataset (if multiple + datasets are analyzed jointly); all + resolutions. + PDF format. - cmp_gr_clst_spl_cnd_res_plot_png: + cmp_gr_ph_spl_clst_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_cmp_gr_clst_spl_cnd_res_*.png" + glob: "*_cmp_gr_ph_spl_clst_res_*.png" doc: | - Composition plot, colored by - cluster, split by grouping - condition, downsampled. + Composition plot colored by cell cycle phase. 
+ Split by cluster; downsampled + to the smallest dataset (if multiple + datasets are analyzed jointly); all + resolutions. PNG format - cmp_gr_clst_spl_cnd_res_plot_pdf: + cmp_gr_ph_spl_clst_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_cmp_gr_clst_spl_cnd_res_*.pdf" + glob: "*_cmp_gr_ph_spl_clst_res_*.pdf" doc: | - Composition plot, colored by - cluster, split by grouping - condition, downsampled. + Composition plot colored by cell cycle phase. + Split by cluster; downsampled + to the smallest dataset (if multiple + datasets are analyzed jointly); all + resolutions. PDF format - cmp_gr_cnd_spl_clst_res_plot_png: + umap_gr_clst_spl_cnd_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_cmp_gr_cnd_spl_clst_res_*.png" + glob: "*_umap_gr_clst_spl_cnd_res_*.png" doc: | - Composition plot, colored by - grouping condition, split by - cluster, downsampled. - PNG format + UMAP colored by cluster. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group; all resolutions. + PNG format. - cmp_gr_cnd_spl_clst_res_plot_pdf: + umap_gr_clst_spl_cnd_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_cmp_gr_cnd_spl_clst_res_*.pdf" + glob: "*_umap_gr_clst_spl_cnd_res_*.pdf" doc: | - Composition plot, colored by - grouping condition, split by - cluster, downsampled. - PDF format + UMAP colored by cluster. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group; all resolutions. + PDF format. - umap_spl_ph_res_plot_png: + cmp_gr_clst_spl_cnd_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_spl_ph_res_*.png" + glob: "*_cmp_gr_clst_spl_cnd_res_*.png" doc: | - UMAP, colored by cluster, - split by cell cycle phase. - PNG format + Composition plot colored by cluster.
+ Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group; all resolutions. + PNG format. - umap_spl_ph_res_plot_pdf: + cmp_gr_clst_spl_cnd_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_spl_ph_res_*.pdf" - doc: | - UMAP, colored by cluster, - split by cell cycle phase. - PDF format - - cmp_gr_ph_spl_idnt_plot_png: - type: File? - outputBinding: - glob: "*_cmp_gr_ph_spl_idnt.png" - doc: | - Composition plot, colored by - cell cycle phase, split by - dataset, downsampled. - PNG format - - cmp_gr_ph_spl_idnt_plot_pdf: - type: File? - outputBinding: - glob: "*_cmp_gr_ph_spl_idnt.pdf" + glob: "*_cmp_gr_clst_spl_cnd_res_*.pdf" doc: | - Composition plot, colored by - cell cycle phase, split by - dataset, downsampled. - PDF format + Composition plot colored by cluster. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group; all resolutions. + PDF format. - cmp_gr_ph_spl_clst_res_plot_png: + cmp_gr_cnd_spl_clst_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_cmp_gr_ph_spl_clst_res_*.png" + glob: "*_cmp_gr_cnd_spl_clst_res_*.png" doc: | - Composition plot, colored by - cell cycle phase, split by - cluster, downsampled. - PNG format + Composition plot colored by grouping condition. + Split by cluster; first downsampled to the + smallest dataset, then downsampled to the + smallest group; all resolutions. + PNG format. - cmp_gr_ph_spl_clst_res_plot_pdf: + cmp_gr_cnd_spl_clst_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_cmp_gr_ph_spl_clst_res_*.pdf" + glob: "*_cmp_gr_cnd_spl_clst_res_*.pdf" doc: | - Composition plot, colored by - cell cycle phase, split by - cluster, downsampled. - PDF format + Composition plot colored by grouping condition. 
+ Split by cluster; first downsampled to the + smallest dataset, then downsampled to the + smallest group; all resolutions. + PDF format. - xpr_avg_res_plot_png: + xpr_per_cell_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_avg_res_*.png" + glob: "*_xpr_per_cell_*.png" doc: | - Gene expression dot plot. - PNG format + UMAP colored by gene expression. + All genes of interest. + PNG format. - xpr_avg_res_plot_pdf: + xpr_per_cell_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_avg_res_*.pdf" + glob: "*_xpr_per_cell_*.pdf" doc: | - Gene expression dot plot. - PDF format + UMAP colored by gene expression. + All genes of interest. + PDF format. - xpr_per_cell_plot_png: + xpr_per_cell_sgnl_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_per_cell_[!sgnl_]*.png" + glob: "*_xpr_per_cell_sgnl_*.png" doc: | - UMAP, gene expression. - PNG format + UMAP colored by gene expression density. + All genes of interest. + PNG format. - xpr_per_cell_plot_pdf: + xpr_per_cell_sgnl_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_per_cell_[!sgnl_]*.pdf" + glob: "*_xpr_per_cell_sgnl_*.pdf" doc: | - UMAP, gene expression. - PDF format + UMAP colored by gene expression density. + All genes of interest. + PDF format. - xpr_per_cell_sgnl_plot_png: + xpr_avg_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_per_cell_sgnl_*.png" + glob: "*_xpr_avg_res_*.png" doc: | - UMAP, gene expression density. - PNG format + Average gene expression. + All resolutions. + PNG format. - xpr_per_cell_sgnl_plot_pdf: + xpr_avg_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_per_cell_sgnl_*.pdf" + glob: "*_xpr_avg_res_*.pdf" doc: | - UMAP, gene expression density. - PDF format + Average gene expression. + All resolutions. + PDF format. 
xpr_dnst_res_plot_png: type: @@ -725,8 +825,9 @@ outputs: outputBinding: glob: "*_xpr_dnst_res_*.png" doc: | - Gene expression violin plot. - PNG format + Gene expression density. + All genes of interest; all resolutions. + PNG format. xpr_dnst_res_plot_pdf: type: @@ -736,131 +837,151 @@ outputs: outputBinding: glob: "*_xpr_dnst_res_*.pdf" doc: | - Gene expression violin plot. - PDF format + Gene expression density. + All genes of interest; all resolutions. + PDF format. - cvrg_res_plot_png: + xpr_htmp_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_cvrg_res_*.png" + glob: "*_xpr_htmp_res_*.png" doc: | - ATAC fragments coverage. - PNG format + Gene expression heatmap. + Top gene markers; all resolutions. + PNG format. - cvrg_res_plot_pdf: + xpr_htmp_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_cvrg_res_*.pdf" + glob: "*_xpr_htmp_res_*.pdf" doc: | - ATAC fragments coverage. - PDF format + Gene expression heatmap. + Top gene markers; all resolutions. + PDF format. - xpr_htmp_res_plot_png: + xpr_htmp_res_tsv: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_htmp_res_*.png" + glob: "*_xpr_htmp_res_*.tsv" doc: | Gene expression heatmap. - PNG format + Top gene markers; all resolutions. + TSV format. - xpr_htmp_res_plot_pdf: + cvrg_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_htmp_res_*.pdf" + glob: "*_cvrg_res_*.png" doc: | - Gene expression heatmap. - PDF format + ATAC fragment coverage. + All genes of interest; all resolutions. + PNG format. - xpr_htmp_res_tsv: + cvrg_res_plot_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_htmp_res_*.tsv" + glob: "*_cvrg_res_*.pdf" doc: | - Gene markers used for gene - expression heatmap. - TSV format + ATAC fragment coverage. + All genes of interest; all resolutions. + PDF format. gene_markers_tsv: type: File? 
outputBinding: glob: "*_gene_markers.tsv" doc: | - Gene markers per cluster for - all resolutions. - TSV format + Gene markers. + All resolutions. + TSV format. peak_markers_tsv: type: File? outputBinding: glob: "*_peak_markers.tsv" doc: | - Peak markers per cluster for - all resolutions. - TSV format + Peak markers. + All resolutions. + TSV format. ucsc_cb_config_data: type: Directory? outputBinding: glob: "*_cellbrowser" doc: | - Directory with UCSC Cellbrowser configuration data. + UCSC Cell Browser configuration data. ucsc_cb_html_data: type: Directory? outputBinding: glob: "*_cellbrowser/html_data" doc: | - Directory with UCSC Cellbrowser html data. + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? outputBinding: glob: "*_cellbrowser/html_data/index.html" doc: | - HTML index file from the directory with UCSC Cellbrowser html data. + UCSC Cell Browser html index. seurat_data_rds: type: File outputBinding: glob: "*_data.rds" doc: | - Reduced Seurat data in RDS format + Seurat object. + RDS format seurat_data_h5seurat: type: File? outputBinding: glob: "*_data.h5seurat" doc: | - Reduced Seurat data in h5seurat format + Seurat object. + h5Seurat format + + seurat_rna_data_h5ad: + type: File? + outputBinding: + glob: "*_rna_counts.h5ad" + doc: | + Seurat object. + RNA counts. + H5AD format. - seurat_data_h5ad: + seurat_atac_data_h5ad: type: File? outputBinding: - glob: "*_data.h5ad" + glob: "*_atac_counts.h5ad" doc: | - Reduced Seurat data in h5ad format + Seurat object. + ATAC counts. + H5AD format. seurat_data_scope: type: File? outputBinding: glob: "*_data.loom" doc: | - Reduced Seurat data in SCope compatible loom format + Seurat object. + SCope compatible. 
+ Loom format stdout_log: type: stdout @@ -882,8 +1003,8 @@ $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell WNN Cluster Analysis" -s:name: "Single-cell WNN Cluster Analysis" +label: "Single-Cell WNN Cluster Analysis" +s:name: "Single-Cell WNN Cluster Analysis" s:alternateName: "Clusters multiome ATAC and RNA-Seq datasets, identifies gene markers and differentially accessible peaks" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-wnn-cluster.cwl @@ -922,14 +1043,14 @@ s:creator: doc: | - Single-cell WNN Cluster Analysis + Single-Cell WNN Cluster Analysis Clusters multiome ATAC and RNA-Seq datasets, identifies gene markers and differentially accessible peaks. s:about: | - usage: sc_wnn_cluster.R [-h] --query QUERY + usage: /usr/local/bin/sc_wnn_cluster.R [-h] --query QUERY [--rnadimensions RNADIMENSIONS] [--atacdimensions ATACDIMENSIONS] [--algorithm {louvain,mult-louvain,slm,leiden}] @@ -954,17 +1075,18 @@ s:about: | [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] + [--seed SEED] - Single-cell WNN Cluster Analysis + Single-Cell WNN Cluster Analysis optional arguments: -h, --help show this help message and exit --query QUERY Path to the RDS file to load Seurat object from. This file should include genes expression and chromatin accessibility information stored in the RNA and ATAC - assays correspondingly. Additionally, 'pca', - 'rnaumap', 'atac_lsi' and 'atacumap' dimensionality - reductions should be present. + assays correspondingly. Additionally, 'pca' and + 'atac_lsi' dimensionality reductions should be + present. --rnadimensions RNADIMENSIONS Dimensionality from the 'pca' reduction to use when constructing weighted nearest-neighbor graph before @@ -1067,7 +1189,8 @@ s:about: | --pdf Export plots in PDF. Default: false --verbose Print debug information. 
Default: false --h5seurat Save Seurat data to h5seurat file. Default: false - --h5ad Save Seurat data to h5ad file. Default: false + --h5ad Save raw counts from the RNA and ATAC assays to h5ad + files. Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --scope Save Seurat data to SCope compatible loom file. Only not normalized raw counts from the RNA assay will be @@ -1077,4 +1200,5 @@ s:about: | Color theme for all generated plots. Default: classic --cpus CPUS Number of cores/cpus to use. Default: 1 --memory MEMORY Maximum memory in GB allowed to be shared between the - workers when using multiple --cpus. Default: 32 \ No newline at end of file + workers when using multiple --cpus. Default: 32 + --seed SEED Seed number for random values. Default: 42 \ No newline at end of file diff --git a/workflows/cellranger-aggr.cwl b/workflows/cellranger-aggr.cwl index 689872f0..77624894 100644 --- a/workflows/cellranger-aggr.cwl +++ b/workflows/cellranger-aggr.cwl @@ -9,7 +9,7 @@ requirements: - class: MultipleInputFeatureRequirement -'sd:upstream': +"sd:upstream": sc_experiment: - "single-cell-preprocess-cellranger.cwl" - "cellranger-multi.cwl" @@ -19,7 +19,7 @@ inputs: alias: type: string - label: "Experiment short name/Alias" + label: "Experiment short name/alias" sd:preview: position: 1 @@ -33,18 +33,18 @@ inputs: that produces gene expression and, optionally, V(D)J contigs data, from a single 10x Genomics library - 'sd:upstreamSource': "sc_experiment/molecule_info_h5" - 'sd:localLabel': true + "sd:upstreamSource": "sc_experiment/molecule_info_h5" + "sd:localLabel": true filtered_data_folder: type: - "null" - Directory[] - 'sd:upstreamSource': "sc_experiment/filtered_data_folder" + "sd:upstreamSource": "sc_experiment/filtered_data_folder" gem_well_labels: type: string[] - 'sd:upstreamSource': "sc_experiment/alias" + "sd:upstreamSource": "sc_experiment/alias" normalization_mode: type: @@ -56,7 +56,7 @@ inputs: default: "none" label: "Library depth 
normalization mode" doc: "Library depth normalization mode" - 'sd:layout': + "sd:layout": advanced: true clonotype_grouping: @@ -74,15 +74,15 @@ inputs: When cellranger aggr is called with cellranger multi outputs, there are three ways it can process the datasets depending on the combination of donor and origin values - 'sd:layout': + "sd:layout": advanced: true threads: type: int? - default: 4 + default: 6 label: "Number of threads" doc: "Number of threads for those steps that support multithreading" - 'sd:layout': + "sd:layout": advanced: true memory_limit: @@ -90,7 +90,7 @@ inputs: default: 30 label: "Maximum memory used (GB)" doc: "Maximum memory used (GB). The same will be applied to virtual memory" - 'sd:layout': + "sd:layout": advanced: true @@ -101,9 +101,9 @@ outputs: outputSource: aggregate_counts/web_summary_report label: "Aggregated run summary metrics and charts in HTML format" doc: "Aggregated run summary metrics and charts in HTML format" - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" metrics_summary_report_json: @@ -159,10 +159,10 @@ outputs: outputSource: aggregate_counts/clonotypes_csv label: "CSV file with high-level descriptions of each clonotype" doc: "CSV file with high-level descriptions of each clonotype" - 'sd:visualPlugins': + "sd:visualPlugins": - syncfusiongrid: - tab: 'V(D)J clonotypes' - Title: 'V(D)J clonotypes' + tab: "V(D)J clonotypes" + Title: "V(D)J clonotypes" consensus_sequences_fasta: type: File? 
@@ -205,9 +205,9 @@ outputs: outputSource: cellbrowser_build/index_html_file label: "CellBrowser formatted Cellranger report" doc: "CellBrowser formatted Cellranger report" - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" aggregate_counts_stdout_log: diff --git a/workflows/cellranger-arc-aggr.cwl b/workflows/cellranger-arc-aggr.cwl index 1ffe4a37..3d543f0d 100644 --- a/workflows/cellranger-arc-aggr.cwl +++ b/workflows/cellranger-arc-aggr.cwl @@ -9,7 +9,7 @@ requirements: - class: MultipleInputFeatureRequirement -'sd:upstream': +"sd:upstream": sc_arc_sample: - "cellranger-arc-count.cwl" genome_indices: @@ -32,22 +32,22 @@ inputs: that produces both gene expression and chromatin accessibility data from a single 10x Genomics library - 'sd:upstreamSource': "sc_arc_sample/gex_molecule_info_h5" - 'sd:localLabel': true + "sd:upstreamSource": "sc_arc_sample/gex_molecule_info_h5" + "sd:localLabel": true gem_well_labels: type: string[] - 'sd:upstreamSource': "sc_arc_sample/alias" + "sd:upstreamSource": "sc_arc_sample/alias" atac_fragments_file_from_count: type: File[] secondaryFiles: - .tbi - 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" + "sd:upstreamSource": "sc_arc_sample/atac_fragments_file" barcode_metrics_report: type: File[] - 'sd:upstreamSource': "sc_arc_sample/barcode_metrics_report" + "sd:upstreamSource": "sc_arc_sample/barcode_metrics_report" indices_folder: type: Directory @@ -60,13 +60,13 @@ inputs: This sample can be obtained from "Cell Ranger Reference (RNA, ATAC, RNA+ATAC)" pipeline. - 'sd:upstreamSource': "genome_indices/arc_indices_folder" - 'sd:localLabel': true + "sd:upstreamSource": "genome_indices/arc_indices_folder" + "sd:localLabel": true memory_limit: type: int? 
default: 20 - 'sd:upstreamSource': "genome_indices/memory_limit" + "sd:upstreamSource": "genome_indices/memory_limit" normalization_mode: type: @@ -88,7 +88,7 @@ inputs: mapped to the transcriptome per cell for each gene expression library. - 'sd:layout': + "sd:layout": advanced: true threads: @@ -102,14 +102,14 @@ inputs: - "4" - "5" - "6" - default: "4" + default: "6" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 4 + Default: 6 - 'sd:layout': + "sd:layout": advanced: true @@ -121,9 +121,9 @@ outputs: label: "Cell Ranger Summary" doc: | Report generated by Cell Ranger - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" cellbrowser_report: @@ -132,9 +132,9 @@ outputs: label: "UCSC Cell Browser" doc: | UCSC Cell Browser HTML index file - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" metrics_summary_report: diff --git a/workflows/cellranger-arc-count.cwl b/workflows/cellranger-arc-count.cwl index e15b3b9d..934e9b8b 100644 --- a/workflows/cellranger-arc-count.cwl +++ b/workflows/cellranger-arc-count.cwl @@ -9,7 +9,7 @@ requirements: - class: MultipleInputFeatureRequirement -'sd:upstream': +"sd:upstream": genome_indices: - "cellranger-mkref.cwl" @@ -33,13 +33,13 @@ inputs: This sample can be obtained from "Cell Ranger Reference (RNA, ATAC, RNA+ATAC)" pipeline. - 'sd:upstreamSource': "genome_indices/arc_indices_folder" - 'sd:localLabel': true + "sd:upstreamSource": "genome_indices/arc_indices_folder" + "sd:localLabel": true memory_limit: type: int? default: 20 - 'sd:upstreamSource': "genome_indices/memory_limit" + "sd:upstreamSource": "genome_indices/memory_limit" gex_fastq_file_r1: type: @@ -117,7 +117,7 @@ inputs: the reference are counted. Using this mode will reduce the UMI counts and decrease sensitivity.
- 'sd:layout': + "sd:layout": advanced: true threads: @@ -131,14 +131,14 @@ inputs: - "4" - "5" - "6" - default: "4" + default: "6" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 4 + Default: 6 - 'sd:layout': + "sd:layout": advanced: true @@ -150,9 +150,9 @@ outputs: label: "Cell Ranger Summary" doc: | Report generated by Cell Ranger - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" cellbrowser_report: @@ -161,9 +161,9 @@ outputs: label: "UCSC Cell Browser" doc: | UCSC Cell Browser HTML index file - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" fastqc_report_gex_fastq_r1: @@ -173,9 +173,9 @@ outputs: doc: | FastqQC report generated for RNA FASTQ file, Read 1 - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" fastqc_report_gex_fastq_r2: @@ -185,9 +185,9 @@ outputs: doc: | FastqQC report generated for RNA FASTQ file, Read 2 - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" fastqc_report_atac_fastq_r1: @@ -197,9 +197,9 @@ outputs: doc: | FastqQC report generated for ATAC FASTQ file, Read 1 - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" fastqc_report_atac_fastq_r2: @@ -209,9 +209,9 @@ outputs: doc: | FastqQC report generated for ATAC FASTQ file, Read 2 - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" fastqc_report_atac_fastq_r3: @@ -221,9 +221,9 @@ outputs: doc: | FastqQC report generated for ATAC FASTQ file, Read 3 - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" metrics_summary_report: @@ -454,9 +454,9 @@ outputs: label: "Collected statistics" doc: | Collected statistics in Markdown
format - 'sd:visualPlugins': + "sd:visualPlugins": - markdownView: - tab: 'Overview' + tab: "Overview" collected_statistics_tsv: type: File @@ -464,10 +464,10 @@ outputs: label: "Collected statistics" doc: | Collected statistics in TSV format - 'sd:visualPlugins': + "sd:visualPlugins": - tableView: vertical: true - tab: 'Overview' + tab: "Overview" html_data_folder: type: Directory diff --git a/workflows/cellranger-atac-aggr.cwl b/workflows/cellranger-atac-aggr.cwl index 6276d8c0..04ba9aea 100644 --- a/workflows/cellranger-atac-aggr.cwl +++ b/workflows/cellranger-atac-aggr.cwl @@ -9,7 +9,7 @@ requirements: - class: MultipleInputFeatureRequirement -'sd:upstream': +"sd:upstream": sc_atacseq_sample: - "cellranger-atac-count.cwl" genome_indices: @@ -20,7 +20,7 @@ inputs: alias: type: string - label: "Experiment short name/Alias" + label: "Experiment short name/alias" sd:preview: position: 1 @@ -31,8 +31,8 @@ inputs: Array of GEM well identifiers to be used for labeling purposes only. If not provided use rootnames of files from the barcode_metrics_report input - 'sd:upstreamSource': "sc_atacseq_sample/alias" - 'sd:localLabel': true + "sd:upstreamSource": "sc_atacseq_sample/alias" + "sd:localLabel": true fragments_file_from_count: type: File[] @@ -43,7 +43,7 @@ inputs: Array of files containing count and barcode information for every ATAC fragment observed in the "cellranger-atac count" experiment in TSV format. 
- 'sd:upstreamSource': "sc_atacseq_sample/atac_fragments_file" + "sd:upstreamSource": "sc_atacseq_sample/atac_fragments_file" barcode_metrics_report_from_count: type: File[] @@ -51,7 +51,7 @@ inputs: doc: | Array of files with per-barcode fragment counts & metrics produced by "cellranger-atac count" command in CSV format - 'sd:upstreamSource': "sc_atacseq_sample/barcode_metrics_report" + "sd:upstreamSource": "sc_atacseq_sample/barcode_metrics_report" indices_folder: type: Directory @@ -64,13 +64,13 @@ inputs: This sample can be obtained from "Cell Ranger Reference (RNA, ATAC, RNA+ATAC)" pipeline. - 'sd:upstreamSource': "genome_indices/arc_indices_folder" - 'sd:localLabel': true + "sd:upstreamSource": "genome_indices/arc_indices_folder" + "sd:localLabel": true memory_limit: type: int? default: 20 - 'sd:upstreamSource': "genome_indices/memory_limit" + "sd:upstreamSource": "genome_indices/memory_limit" normalization_mode: type: @@ -80,7 +80,7 @@ inputs: default: "none" label: "Library depth normalization mode" doc: "Library depth normalization mode" - 'sd:layout': + "sd:layout": advanced: true threads: @@ -94,14 +94,14 @@ inputs: - "4" - "5" - "6" - default: "4" + default: "6" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. 
- Default: 4 + Default: 6 - 'sd:layout': + "sd:layout": advanced: true @@ -113,9 +113,9 @@ outputs: label: "Run summary metrics and charts in HTML format" doc: | Run summary metrics and charts in HTML format - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" metrics_summary_report_json: @@ -243,9 +243,9 @@ outputs: label: "CellBrowser formatted Cellranger report" doc: | CellBrowser formatted Cellranger report - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" diff --git a/workflows/cellranger-atac-count.cwl b/workflows/cellranger-atac-count.cwl index b2e97921..9a954f6b 100644 --- a/workflows/cellranger-atac-count.cwl +++ b/workflows/cellranger-atac-count.cwl @@ -9,7 +9,7 @@ requirements: - class: MultipleInputFeatureRequirement -'sd:upstream': +"sd:upstream": genome_indices: - "cellranger-mkref.cwl" @@ -18,7 +18,7 @@ inputs: alias: type: string - label: "Experiment short name/Alias" + label: "Experiment short name/alias" sd:preview: position: 1 @@ -33,13 +33,13 @@ inputs: This sample can be obtained from "Cell Ranger Reference (RNA, ATAC, RNA+ATAC)" pipeline. - 'sd:upstreamSource': "genome_indices/arc_indices_folder" - 'sd:localLabel': true + "sd:upstreamSource": "genome_indices/arc_indices_folder" + "sd:localLabel": true memory_limit: type: int? default: 20 - 'sd:upstreamSource': "genome_indices/memory_limit" + "sd:upstreamSource": "genome_indices/memory_limit" fastq_file_r1: type: @@ -73,7 +73,7 @@ inputs: Define the top N barcodes with the most ATAC fragments overlapping peaks as cells. N must be a positive integer <= 20,000. Please consult the documentation before using this option - 'sd:layout': + "sd:layout": advanced: true threads: @@ -87,14 +87,14 @@ inputs: - "4" - "5" - "6" - default: "4" + default: "6" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously.
- Default: 4 + Default: 6 - 'sd:layout': + "sd:layout": advanced: true @@ -106,9 +106,9 @@ outputs: label: "FastqQC report for FASTQ file R1" doc: | FastqQC report for FASTQ file R1 - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" fastqc_report_fastq_r2: @@ -117,9 +117,9 @@ outputs: label: "FastqQC report for FASTQ file R2" doc: | FastqQC report for FASTQ file R2 - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" fastqc_report_fastq_r3: @@ -128,9 +128,9 @@ outputs: label: "FastqQC report for FASTQ file R3" doc: | FastqQC report for FASTQ file R3 - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" web_summary_report: @@ -139,9 +139,9 @@ outputs: label: "Cell Ranger summary" doc: | Run summary metrics and charts in HTML format - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" metrics_summary_report_json: @@ -326,19 +326,19 @@ outputs: outputSource: collect_statistics/collected_statistics_md label: "Collected statistics in Markdown format" doc: "Collected statistics in Markdown format" - 'sd:visualPlugins': + "sd:visualPlugins": - markdownView: - tab: 'Overview' + tab: "Overview" collected_statistics_tsv: type: File outputSource: collect_statistics/collected_statistics_tsv label: "Collected statistics in TSV format" doc: "Collected statistics in TSV format" - 'sd:visualPlugins': + "sd:visualPlugins": - tableView: vertical: true - tab: 'Overview' + tab: "Overview" html_data_folder: type: Directory @@ -353,9 +353,9 @@ outputs: label: "CellBrowser formatted Cellranger report" doc: | CellBrowser formatted Cellranger report - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" diff --git a/workflows/cellranger-mkref.cwl b/workflows/cellranger-mkref.cwl index 73359ee8..013e1ad1 100644 ---
a/workflows/cellranger-mkref.cwl +++ b/workflows/cellranger-mkref.cwl @@ -9,9 +9,10 @@ requirements: - class: MultipleInputFeatureRequirement -'sd:upstream': +"sd:upstream": genome_indices: - "genome-indices.cwl" + - "https://github.com/datirium/workflows/workflows/genome-indices.cwl" inputs: @@ -29,12 +30,12 @@ inputs: Genome type to be used for generating reference genome indices - 'sd:upstreamSource': "genome_indices/fasta_output" - 'sd:localLabel': true + "sd:upstreamSource": "genome_indices/fasta_output" + "sd:localLabel": true annotation_gtf_file: type: File - 'sd:upstreamSource': "genome_indices/annotation_gtf" + "sd:upstreamSource": "genome_indices/annotation_gtf" memory_limit: type: int? @@ -43,7 +44,7 @@ inputs: doc: | Maximum memory used (GB). The same will be applied to virtual memory - 'sd:layout': + "sd:layout": advanced: true threads: @@ -57,14 +58,14 @@ inputs: - "4" - "5" - "6" - default: "4" + default: "6" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. 
- Default: 4 + Default: 6 - 'sd:layout': + "sd:layout": advanced: true diff --git a/workflows/cellranger-mkvdjref.cwl b/workflows/cellranger-mkvdjref.cwl index f9d23f7d..439e2618 100644 --- a/workflows/cellranger-mkvdjref.cwl +++ b/workflows/cellranger-mkvdjref.cwl @@ -13,7 +13,7 @@ inputs: alias: type: string - label: "Experiment short name/Alias" + label: "Experiment short name/alias" sd:preview: position: 1 diff --git a/workflows/cellranger-multi.cwl b/workflows/cellranger-multi.cwl index 0ca5b3a3..76c73f56 100644 --- a/workflows/cellranger-multi.cwl +++ b/workflows/cellranger-multi.cwl @@ -9,7 +9,7 @@ requirements: - class: MultipleInputFeatureRequirement -'sd:upstream': +"sd:upstream": gex_indices: - "cellranger-mkref.cwl" vdj_indices: @@ -20,7 +20,7 @@ inputs: alias: type: string - label: "Experiment short name/Alias" + label: "Experiment short name/alias" sd:preview: position: 1 @@ -35,8 +35,8 @@ inputs: This sample can be obtained from "Cell Ranger Reference (RNA, ATAC, RNA+ATAC)" pipeline. - 'sd:upstreamSource': "gex_indices/indices_folder" - 'sd:localLabel': true + "sd:upstreamSource": "gex_indices/indices_folder" + "sd:localLabel": true memory_limit: type: int? @@ -53,8 +53,8 @@ inputs: assembly and clonotype calling. This sample can be obtained from "Cell Ranger Reference (VDJ)" pipeline. - 'sd:upstreamSource': "vdj_indices/indices_folder" - 'sd:localLabel': true + "sd:upstreamSource": "vdj_indices/indices_folder" + "sd:localLabel": true gex_fastq_file_r1: type: @@ -109,7 +109,7 @@ inputs: Auto-detection does not work for TRG/D (gamma-delta) chains. Note that gamma-delta analysis is enabled but the algorithm has not been tested extensively.
- 'sd:layout': + "sd:layout": advanced: true threads: @@ -123,7 +123,7 @@ inputs: - "4" - "5" - "6" - default: "4" + default: "6" label: "Cores/CPUs" doc: | Parallelization parameter to define the @@ -142,9 +142,9 @@ outputs: label: "FastqQC report for GEX FASTQ file R1" doc: | FastqQC report for GEX FASTQ file R1 - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" fastqc_report_gex_fastq_r2: @@ -153,9 +153,9 @@ outputs: label: "FastqQC report for GEX FASTQ file R2" doc: | FastqQC report for GEX FASTQ file R2 - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" fastqc_report_vdj_fastq_r1: @@ -164,9 +164,9 @@ outputs: label: "FastqQC report for V(D)J FASTQ file R1" doc: | FastqQC report for V(D)J FASTQ file R1 - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" fastqc_report_vdj_fastq_r2: @@ -175,9 +175,9 @@ outputs: label: "FastqQC report for V(D)J FASTQ file R2" doc: | FastqQC report for V(D)J FASTQ file R2 - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" web_summary_report: @@ -186,9 +186,9 @@ outputs: label: "Gene Expression and V(D)J Repertoire Profiling" doc: | Run summary metrics and charts in HTML format - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" metrics_summary_report: @@ -197,10 +197,10 @@ outputs: label: "Run summary metrics in TSV format" doc: | Run summary metrics in TSV format - 'sd:visualPlugins': + "sd:visualPlugins": - syncfusiongrid: - tab: 'QC metrics' - Title: 'QC metrics' + tab: "QC metrics" + Title: "QC metrics" possorted_genome_bam_bai: type: File @@ -332,10 +332,10 @@ outputs: artifacts and filtered out, meaning that they are no longer called as cells. 
However, as clonotype grouping stage is hapenning before forming the final version of files in the per_sample_outs folder, the reported cells number won't be affected. - 'sd:visualPlugins': + "sd:visualPlugins": - syncfusiongrid: - tab: 'V(D)J clonotypes' - Title: 'V(D)J clonotypes' + tab: "V(D)J clonotypes" + Title: "V(D)J clonotypes" germline_contigs_bam_bai: type: File @@ -428,9 +428,9 @@ outputs: label: "UCSC Cell Browser" doc: | CellBrowser formatted Cellranger report - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" cellranger_multi_stdout_log: diff --git a/workflows/cellranger-reanalyze.cwl b/workflows/cellranger-reanalyze.cwl index cb34623b..62fedddd 100644 --- a/workflows/cellranger-reanalyze.cwl +++ b/workflows/cellranger-reanalyze.cwl @@ -9,7 +9,7 @@ requirements: - class: MultipleInputFeatureRequirement -'sd:upstream': +"sd:upstream": sc_experiment: - "single-cell-preprocess-cellranger.cwl" - "cellranger-multi.cwl" @@ -19,7 +19,7 @@ inputs: alias: type: string - label: "Experiment short name/Alias" + label: "Experiment short name/alias" sd:preview: position: 1 @@ -27,8 +27,8 @@ inputs: type: File label: "Single-cell Experiment" doc: "Filtered feature-barcode matrices in HDF5 format from cellranger count/multi" - 'sd:upstreamSource': "sc_experiment/filtered_feature_bc_matrix_h5" - 'sd:localLabel': true + "sd:upstreamSource": "sc_experiment/filtered_feature_bc_matrix_h5" + "sd:localLabel": true selected_barcodes: type: File? @@ -66,7 +66,7 @@ inputs: Use this if the number of cells estimated by Cell Ranger is not consistent with the barcode rank plot. If specifying a value that exceeds the original cell count, you must use the raw_gene_bc_matrices_h5.h5 - 'sd:layout': + "sd:layout": advanced: true num_analysis_bcs: @@ -78,7 +78,7 @@ inputs: want to improve performance or simulate results from lower cell counts. Cannot be set higher than the available number of cells. 
Default: null - 'sd:layout': + "sd:layout": advanced: true num_pca_bcs: @@ -91,7 +91,7 @@ inputs: will still reflect all the data. Try reducing this parameter if your analysis is running out of memory. Cannot be set higher than the available number of cells. Default: null - 'sd:layout': + "sd:layout": advanced: true num_pca_genes: @@ -104,7 +104,7 @@ inputs: your analysis is running out of memory. Cannot be set higher than the number of genes in the reference transcriptome. Default: null - 'sd:layout': + "sd:layout": advanced: true num_principal_comps: @@ -116,7 +116,7 @@ inputs: to be called. The default value is 100 when the chemistry batch correction is enabled. Set from 10 to 100, depending on the number of cell populations/clusters you expect to see. Default: 10 - 'sd:layout': + "sd:layout": advanced: true cbc_knn: @@ -128,7 +128,7 @@ inputs: Setting this too high will increase runtime and may cause out of memory error. See Chemistry Batch Correction page for more details. Ranges from 5 to 20. Default: 10 - 'sd:layout': + "sd:layout": advanced: true cbc_alpha: @@ -140,7 +140,7 @@ inputs: which is used to determine if the batch pair will be merged. See Chemistry Batch Correction page for more details. Ranges from 0.05 to 0.5. Default: 0.1 - 'sd:layout': + "sd:layout": advanced: true cbc_sigma: @@ -152,7 +152,7 @@ inputs: vector for each cell. See Chemistry Batch Correction page for more details. Ranges from 10 to 500. Default: 150 - 'sd:layout': + "sd:layout": advanced: true cbc_realign_panorama: @@ -164,7 +164,7 @@ inputs: this to True will usually improve the performance, but will also increase runtime and memory usage. See Chemistry Batch Correction page for more details. One of true or false. Default: false - 'sd:layout': + "sd:layout": advanced: true graphclust_neighbors: @@ -177,7 +177,7 @@ inputs: value and that determined by neighbor_a and neighbor_b. Set this value to zero to use those values instead. 
Ranged from 10 to 500, depending on desired granularity. Default: 0 - 'sd:layout': + "sd:layout": advanced: true neighbor_a: @@ -189,7 +189,7 @@ inputs: k = neighbor_a + neighbor_b * log10(n_cells). The actual number of neighbors used is the maximum of this value and graphclust_neighbors. Determines how clustering granularity scales with cell count. Default: -230.0 - 'sd:layout': + "sd:layout": advanced: true neighbor_b: @@ -201,7 +201,7 @@ inputs: k = neighbor_a + neighbor_b * log10(n_cells). The actual number of neighbors used is the maximum of this value and graphclust_neighbors. Determines how clustering granularity scales with cell count. Default: 120.0 - 'sd:layout': + "sd:layout": advanced: true max_clusters: @@ -212,7 +212,7 @@ inputs: Compute K-means clustering using K values of 2 to N. Setting this too high may cause spurious clusters to be called. Ranges from 10 to 50, depending on the number of cell populations / clusters you expect to see. Default: 10 - 'sd:layout': + "sd:layout": advanced: true tsne_input_pcs: @@ -225,7 +225,7 @@ inputs: is faster and/or the output looks better when using fewer PCs. Cannot be set higher than the num_principal_comps parameter. Default: null - 'sd:layout': + "sd:layout": advanced: true tsne_perplexity: @@ -236,7 +236,7 @@ inputs: TSNE perplexity parameter (see the TSNE FAQ for more details). When analyzing 100k+ cells, increasing this parameter may improve TSNE results, but the algorithm will be slower. Ranges from 30 to 50. Default: 30 - 'sd:layout': + "sd:layout": advanced: true tsne_theta: @@ -248,7 +248,7 @@ inputs: (and vice versa). The runtime and memory performance of TSNE will increase dramatically if you set this below 0.25. Ranges from 0 to 1. Default: 0.5 - 'sd:layout': + "sd:layout": advanced: true tsne_max_dims: @@ -259,7 +259,7 @@ inputs: Maximum number of TSNE output dimensions. Set this to 3 to produce both 2D and 3D TSNE projections (note: runtime will increase significantly). Ranges from 2 to 3. 
Default: 2 - 'sd:layout': + "sd:layout": advanced: true tsne_max_iter: @@ -270,7 +270,7 @@ inputs: Number of total TSNE iterations. Try increasing this if TSNE results do not look good on larger numbers of cells. Runtime increases linearly with number of iterations. Ranges from 1000 to 10000. Default: 1000 - 'sd:layout': + "sd:layout": advanced: true tsne_stop_lying_iter: @@ -281,7 +281,7 @@ inputs: Iteration at which TSNE learning rate is reduced. Try increasing this if TSNE results do not look good on larger numbers of cells. Cannot be set higher than tsne_max_iter. Default: 250 - 'sd:layout': + "sd:layout": advanced: true tsne_mom_switch_iter: @@ -292,7 +292,7 @@ inputs: Iteration at which TSNE momentum is reduced. Try increasing this if TSNE results do not look good on larger numbers of cells. Cannot be set higher than tsne_max_iter. Cannot be set higher than tsne_max_iter. Default: 250 - 'sd:layout': + "sd:layout": advanced: true umap_input_pcs: @@ -305,7 +305,7 @@ inputs: UMAP is faster and/or the output looks better when using fewer PCs. Cannot be set higher than the num_principal_comps parameter. Default: null - 'sd:layout': + "sd:layout": advanced: true umap_n_neighbors: @@ -317,7 +317,7 @@ inputs: Larger values will usually result in more global structure at the loss of detailed local structure. Ranges from 5 to 50. Default: 30 - 'sd:layout': + "sd:layout": advanced: true umap_max_dims: @@ -328,7 +328,7 @@ inputs: Maximum number of UMAP output dimensions. Set this to 3 to produce both 2D and 3D UMAP projections. Ranges from 2 to 3. Default: 2 - 'sd:layout': + "sd:layout": advanced: true umap_min_dist: @@ -340,7 +340,7 @@ inputs: points are more evenly distributed, while smaller values make the embedding more accurately with regard to the local structure. Ranges from 0.001 to 0.5. Default: 0.3 - 'sd:layout': + "sd:layout": advanced: true umap_metric: @@ -374,7 +374,7 @@ inputs: doc: | Determines how the distance is computed in the input space. 
Default: "correlation" - 'sd:layout': + "sd:layout": advanced: true random_seed: @@ -386,15 +386,15 @@ inputs: different results. If the TSNE or UMAP results don't look good, try running multiple times with different seeds and pick the TSNE or UMAP that looks best. Default: 0 - 'sd:layout': + "sd:layout": advanced: true threads: type: int? - default: 4 + default: 6 label: "Number of threads" doc: "Number of threads for those steps that support multithreading" - 'sd:layout': + "sd:layout": advanced: true memory_limit: @@ -402,7 +402,7 @@ inputs: default: 30 label: "Maximum memory used (GB)" doc: "Maximum memory used (GB). The same will be applied to virtual memory" - 'sd:layout': + "sd:layout": advanced: true @@ -422,9 +422,9 @@ outputs: label: "Reanalyzed run summary metrics and charts in HTML format" doc: | Reanalyzed run summary metrics and charts in HTML format - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" filtered_feature_bc_matrix_folder: @@ -469,9 +469,9 @@ outputs: label: "CellBrowser formatted Cellranger report" doc: | CellBrowser formatted Cellranger report - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" reanalyze_stdout_log: diff --git a/workflows/fastq-download.cwl b/workflows/fastq-download.cwl index 3a3c617a..9fbae3f2 100644 --- a/workflows/fastq-download.cwl +++ b/workflows/fastq-download.cwl @@ -21,7 +21,7 @@ inputs: alias: type: string - label: "Experiment short name/Alias" + label: "Experiment short name/alias" sd:preview: position: 1 @@ -72,7 +72,7 @@ inputs: label: "Optional HTTP proxy settings" doc: | Optional HTTP proxy settings - 'sd:layout': + "sd:layout": advanced: true https_proxy: @@ -80,7 +80,7 @@ inputs: label: "Optional HTTPS proxy settings" doc: | Optional HTTPS proxy settings - 'sd:layout': + "sd:layout": advanced: true @@ -103,9 +103,9 @@ outputs: doc: | Collected report for downloaded FASTQ files in Markdown 
format - 'sd:visualPlugins': + "sd:visualPlugins": - markdownView: - tab: 'Overview' + tab: "Overview" metadata_xml: type: diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index c7e43413..8be1143c 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -21,7 +21,7 @@ requirements: }; -'sd:upstream': +"sd:upstream": sc_tools_sample: - "sc-atac-cluster.cwl" - "sc-rna-cluster.cwl" @@ -33,6 +33,7 @@ requirements: - "cellranger-atac-count.cwl" - "cellranger-atac-aggr.cwl" + inputs: alias: @@ -51,8 +52,8 @@ inputs: "Single-Cell ATAC-Seq Dimensionality Reduction Analysis" at any of the processing stages. - 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" - 'sd:localLabel': true + "sd:upstreamSource": "sc_tools_sample/seurat_data_rds" + "sd:localLabel": true atac_fragments_file: type: File? @@ -68,8 +69,8 @@ inputs: (RNA+ATAC)", "Cell Ranger Aggregate (RNA+ATAC)", "Cell Ranger Count (ATAC)", or "Cell Ranger Aggregate (ATAC)". - 'sd:upstreamSource': "sc_atac_sample/atac_fragments_file" - 'sd:localLabel': true + "sd:upstreamSource": "sc_atac_sample/atac_fragments_file" + "sd:localLabel": true dimensions: type: int? @@ -160,61 +161,67 @@ inputs: - "4" - "5" - "6" - default: "1" + default: "6" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 1 + Default: 6 "sd:layout": advanced: true outputs: - umap_res_plot_png: + umap_gr_clst_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_atac_cluster/umap_res_plot_png - label: "UMAP, colored by cluster" + outputSource: sc_atac_cluster/umap_gr_clst_res_plot_png + label: "UMAP colored by cluster (all cells)" doc: | - UMAP, colored by cluster - 'sd:visualPlugins': + UMAP colored by cluster. + All cells. + PNG format. 
+ "sd:visualPlugins": - image: - tab: 'Per cluster' - Caption: 'UMAP, colored by cluster' + tab: "Per cluster" + Caption: "UMAP colored by cluster (all cells)" - slh_res_plot_png: + slh_gr_clst_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_atac_cluster/slh_res_plot_png - label: "Silhouette scores" + outputSource: sc_atac_cluster/slh_gr_clst_res_plot_png + label: "Silhouette scores (all cells)" doc: | - Silhouette scores - 'sd:visualPlugins': + Silhouette scores. + All cells. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per cluster' - Caption: 'Silhouette scores' + tab: "Per cluster" + Caption: "Silhouette scores (all cells)" - umap_spl_idnt_res_plot_png: + umap_gr_clst_spl_idnt_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_atac_cluster/umap_spl_idnt_res_plot_png - label: "UMAP, colored by cluster, split by dataset" + outputSource: sc_atac_cluster/umap_gr_clst_spl_idnt_res_plot_png + label: "UMAP colored by cluster (split by dataset, downsampled)" doc: | - UMAP, colored by cluster, - split by dataset - 'sd:visualPlugins': + UMAP colored by cluster. + Split by dataset; downsampled + to the smallest dataset. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by cluster, split by dataset' + tab: "Per dataset" + Caption: "UMAP colored by cluster (split by dataset, downsampled)" cmp_gr_clst_spl_idnt_res_plot_png: type: @@ -222,15 +229,16 @@ outputs: - type: array items: File outputSource: sc_atac_cluster/cmp_gr_clst_spl_idnt_res_plot_png - label: "Composition plot, colored by cluster, split by dataset, downsampled" + label: "Composition plot colored by cluster (split by dataset, downsampled)" doc: | - Composition plot, colored by - cluster, split by dataset, - downsampled - 'sd:visualPlugins': + Composition plot colored by cluster. + Split by dataset; downsampled + to the smallest dataset. + PNG format. 
+ "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Composition plot, colored by cluster, split by dataset, downsampled' + tab: "Per dataset" + Caption: "Composition plot colored by cluster (split by dataset, downsampled)" cmp_gr_idnt_spl_clst_res_plot_png: type: @@ -238,30 +246,34 @@ outputs: - type: array items: File outputSource: sc_atac_cluster/cmp_gr_idnt_spl_clst_res_plot_png - label: "Composition plot, colored by dataset, split by cluster, downsampled" + label: "Composition plot colored by dataset (split by cluster, downsampled)" doc: | - Composition plot, colored by - dataset, split by cluster, - downsampled - 'sd:visualPlugins': + Composition plot colored by dataset. + Split by cluster; downsampled to the + smallest dataset. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Composition plot, colored by dataset, split by cluster, downsampled' + tab: "Per dataset" + Caption: "Composition plot colored by dataset (split by cluster, downsampled)" - umap_spl_cnd_res_plot_png: + umap_gr_clst_spl_cnd_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_atac_cluster/umap_spl_cnd_res_plot_png - label: "UMAP, colored by cluster, split by grouping condition" + outputSource: sc_atac_cluster/umap_gr_clst_spl_cnd_res_plot_png + label: "UMAP colored by cluster (split by grouping condition, downsampled)" doc: | - UMAP, colored by cluster, split - by grouping condition - 'sd:visualPlugins': + UMAP colored by cluster. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. 
+ "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by cluster, split by grouping condition' + tab: "Per group" + Caption: "UMAP colored by cluster (split by grouping condition, downsampled)" cmp_gr_clst_spl_cnd_res_plot_png: type: @@ -269,15 +281,17 @@ outputs: - type: array items: File outputSource: sc_atac_cluster/cmp_gr_clst_spl_cnd_res_plot_png - label: "Composition plot, colored by cluster, split by grouping condition, downsampled" + label: "Composition plot colored by cluster (split by grouping condition, downsampled)" doc: | - Composition plot, colored by - cluster, split by grouping - condition, downsampled - 'sd:visualPlugins': + Composition plot colored by cluster. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'Composition plot, colored by cluster, split by grouping condition, downsampled' + tab: "Per group" + Caption: "Composition plot colored by cluster (split by grouping condition, downsampled)" cmp_gr_cnd_spl_clst_res_plot_png: type: @@ -285,15 +299,17 @@ outputs: - type: array items: File outputSource: sc_atac_cluster/cmp_gr_cnd_spl_clst_res_plot_png - label: "Composition plot, colored by grouping condition, split by cluster, downsampled" + label: "Composition plot colored by grouping condition (split by cluster, downsampled)" doc: | - Composition plot, colored by - grouping condition, split by - cluster, downsampled - 'sd:visualPlugins': + Composition plot colored by grouping condition. + Split by cluster; first downsampled to the + smallest dataset, then downsampled to the + smallest group. + PNG format. 
+ "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'Composition plot, colored by grouping condition, split by cluster, downsampled' + tab: "Per group" + Caption: "Composition plot colored by grouping condition (split by cluster, downsampled)" cvrg_res_plot_png: type: @@ -301,39 +317,41 @@ outputs: - type: array items: File outputSource: sc_atac_cluster/cvrg_res_plot_png - label: "ATAC fragments coverage" + label: "ATAC fragment coverage (per gene)" doc: | - ATAC fragments coverage - 'sd:visualPlugins': + ATAC fragment coverage. + All genes of interest. + PNG format. + "sd:visualPlugins": - image: - tab: 'Genome coverage' - Caption: 'ATAC fragments coverage' + tab: "Genome coverage" + Caption: "ATAC fragment coverage (per gene)" peak_markers_tsv: type: File? outputSource: sc_atac_cluster/peak_markers_tsv - label: "Peak markers per cluster for all resolutions" + label: "Peak markers" doc: | - Peak markers per cluster for all resolutions - 'sd:visualPlugins': + Peak markers. + TSV format. + "sd:visualPlugins": - syncfusiongrid: - tab: 'Peak markers' - Title: 'Peak markers per cluster for all resolutions' + tab: "Peak markers" + Title: "Peak markers" ucsc_cb_html_data: type: Directory? outputSource: sc_atac_cluster/ucsc_cb_html_data - label: "UCSC Cell Browser data" + label: "UCSC Cell Browser (data)" doc: | - Directory with UCSC Cell Browser - data + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? outputSource: sc_atac_cluster/ucsc_cb_html_file label: "UCSC Cell Browser" doc: | - UCSC Cell Browser HTML index file + UCSC Cell Browser html index. "sd:visualPlugins": - linkList: tab: "Overview" @@ -342,31 +360,31 @@ outputs: seurat_data_rds: type: File outputSource: sc_atac_cluster/seurat_data_rds - label: "Processed Seurat data in RDS format" + label: "Seurat object in RDS format" doc: | - Processed Seurat data in RDS format + Seurat object. + RDS format. 
pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder - label: "Plots in PDF format" + label: "Compressed folder with all PDF plots" doc: | - Compressed folder with plots - in PDF format + Compressed folder with all PDF plots. sc_atac_cluster_stdout_log: type: File outputSource: sc_atac_cluster/stdout_log - label: "stdout log generated by sc_atac_cluster step" + label: "Output log" doc: | - stdout log generated by sc_atac_cluster step + Stdout log from the sc_atac_cluster step. sc_atac_cluster_stderr_log: type: File outputSource: sc_atac_cluster/stderr_log - label: "stderr log generated by sc_atac_cluster step" + label: "Error log" doc: | - stderr log generated by sc_atac_cluster step + Stderr log from the sc_atac_cluster step. steps: @@ -410,21 +428,21 @@ steps: source: threads valueFrom: $(parseInt(self)) out: - - umap_res_plot_png - - slh_res_plot_png - - umap_spl_idnt_res_plot_png + - umap_gr_clst_res_plot_png + - slh_gr_clst_res_plot_png + - umap_gr_clst_spl_idnt_res_plot_png - cmp_gr_clst_spl_idnt_res_plot_png - cmp_gr_idnt_spl_clst_res_plot_png - - umap_spl_cnd_res_plot_png + - umap_gr_clst_spl_cnd_res_plot_png - cmp_gr_clst_spl_cnd_res_plot_png - cmp_gr_cnd_spl_clst_res_plot_png - cvrg_res_plot_png - - umap_res_plot_pdf - - slh_res_plot_pdf - - umap_spl_idnt_res_plot_pdf + - umap_gr_clst_res_plot_pdf + - slh_gr_clst_res_plot_pdf + - umap_gr_clst_spl_idnt_res_plot_pdf - cmp_gr_clst_spl_idnt_res_plot_pdf - cmp_gr_idnt_spl_clst_res_plot_pdf - - umap_spl_cnd_res_plot_pdf + - umap_gr_clst_spl_cnd_res_plot_pdf - cmp_gr_clst_spl_cnd_res_plot_pdf - cmp_gr_cnd_spl_clst_res_plot_pdf - cvrg_res_plot_pdf @@ -440,12 +458,12 @@ steps: in: input_files: source: - - sc_atac_cluster/umap_res_plot_pdf - - sc_atac_cluster/slh_res_plot_pdf - - sc_atac_cluster/umap_spl_idnt_res_plot_pdf + - sc_atac_cluster/umap_gr_clst_res_plot_pdf + - sc_atac_cluster/slh_gr_clst_res_plot_pdf + - sc_atac_cluster/umap_gr_clst_spl_idnt_res_plot_pdf - 
sc_atac_cluster/cmp_gr_clst_spl_idnt_res_plot_pdf - sc_atac_cluster/cmp_gr_idnt_spl_clst_res_plot_pdf - - sc_atac_cluster/umap_spl_cnd_res_plot_pdf + - sc_atac_cluster/umap_gr_clst_spl_cnd_res_plot_pdf - sc_atac_cluster/cmp_gr_clst_spl_cnd_res_plot_pdf - sc_atac_cluster/cmp_gr_cnd_spl_clst_res_plot_pdf - sc_atac_cluster/cvrg_res_plot_pdf diff --git a/workflows/sc-atac-coverage.cwl b/workflows/sc-atac-coverage.cwl index 5d2ba00c..14ddccde 100644 --- a/workflows/sc-atac-coverage.cwl +++ b/workflows/sc-atac-coverage.cwl @@ -17,7 +17,7 @@ requirements: }; -'sd:upstream': +"sd:upstream": sc_tools_sample: - "sc-multiome-filter.cwl" - "sc-atac-reduce.cwl" @@ -31,7 +31,7 @@ requirements: - "cellranger-atac-aggr.cwl" genome_indices: - "genome-indices.cwl" - - "https://github.com/datirium/workflows/workflows/genome-indices.cwl" + inputs: @@ -48,8 +48,8 @@ inputs: Path to the RDS file to load Seurat object from. This file should include chromatin accessibility information stored in the ATAC assay with a proper seqinfo data. - 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" - 'sd:localLabel': true + "sd:upstreamSource": "sc_tools_sample/seurat_data_rds" + "sd:localLabel": true atac_fragments_file: type: File @@ -64,16 +64,16 @@ inputs: Ranger Count (RNA+ATAC)", "Cell Ranger Aggregate (RNA+ATAC)", "Cell Ranger Count (ATAC)", or "Cell Ranger Aggregate (ATAC)". - 'sd:upstreamSource': "sc_atac_sample/atac_fragments_file" - 'sd:localLabel': true + "sd:upstreamSource": "sc_atac_sample/atac_fragments_file" + "sd:localLabel": true chrom_length_file: # not used - need it only for IGV type: File label: "Genome" doc: | Reference genome - 'sd:upstreamSource': "genome_indices/chrom_length" - 'sd:localLabel': true + "sd:upstreamSource": "genome_indices/chrom_length" + "sd:localLabel": true splitby: type: string? @@ -113,7 +113,7 @@ inputs: doc: | Distance in bp to flank both start and end of the each fragment in both direction to generate cut sites coverage. 
Default: 5 - 'sd:layout': + "sd:layout": advanced: true threads: @@ -127,14 +127,14 @@ inputs: - "4" - "5" - "6" - default: "1" + default: "6" label: "Number of cores/cpus to use" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 1 - 'sd:layout': + Default: 6 + "sd:layout": advanced: true @@ -147,12 +147,12 @@ outputs: doc: | Locations of open-chromatin regions ("peaks") in bigBed format - 'sd:visualPlugins': + "sd:visualPlugins": - igvbrowser: - tab: 'Genome Browser' - id: 'igvbrowser' - type: 'annotation' - format: 'bigbed' + tab: "Genome Browser" + id: "igvbrowser" + type: "annotation" + format: "bigbed" name: "Peaks" height: 40 @@ -166,11 +166,11 @@ outputs: doc: | Genome coverage calculated for Tn5 cut sites in bigWig format - 'sd:visualPlugins': + "sd:visualPlugins": - igvbrowser: - tab: 'Genome Browser' - id: 'igvbrowser' - type: 'wig' + tab: "Genome Browser" + id: "igvbrowser" + type: "wig" name: "Cut sites coverage" height: 120 @@ -184,11 +184,11 @@ outputs: doc: | Genome coverage calculated for ATAC fragments in bigWig format - 'sd:visualPlugins': + "sd:visualPlugins": - igvbrowser: - tab: 'Genome Browser' - id: 'igvbrowser' - type: 'wig' + tab: "Genome Browser" + id: "igvbrowser" + type: "wig" name: "ATAC fragments coverage" height: 120 @@ -198,23 +198,23 @@ outputs: doc: | Markdown file to explain the tracks order for IGV outputSource: create_metadata/output_file - 'sd:visualPlugins': + "sd:visualPlugins": - markdownView: - tab: 'Overview' + tab: "Overview" sc_atac_coverage_stdout_log: type: File outputSource: sc_atac_coverage/stdout_log - label: "stdout log generated by sc_atac_coverage step" + label: "Output log" doc: | - stdout log generated by sc_atac_coverage step + Stdout log from the sc_atac_coverage step. 
- sc_atac_reduce_stderr_log: + sc_atac_coverage_stderr_log: type: File outputSource: sc_atac_coverage/stderr_log - label: "stderr log generated by sc_atac_coverage step" + label: "Error log" doc: | - stderr log generated by sc_atac_coverage step + Stderr log from the sc_atac_coverage step. steps: diff --git a/workflows/sc-atac-dbinding.cwl b/workflows/sc-atac-dbinding.cwl index ab1782ba..5699e0c6 100644 --- a/workflows/sc-atac-dbinding.cwl +++ b/workflows/sc-atac-dbinding.cwl @@ -24,7 +24,7 @@ requirements: }; -'sd:upstream': +"sd:upstream": sc_tools_sample: - "sc-atac-cluster.cwl" - "sc-wnn-cluster.cwl" @@ -36,7 +36,6 @@ requirements: - "cellranger-atac-aggr.cwl" genome_indices: - "genome-indices.cwl" - - "https://github.com/datirium/workflows/workflows/genome-indices.cwl" inputs: @@ -56,8 +55,8 @@ inputs: information stored in the ATAC assay. Additionally 'rnaumap', and/or 'atacumap', and/or 'wnnumap' dimensionality reductions should be present. - 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" - 'sd:localLabel': true + "sd:upstreamSource": "sc_tools_sample/seurat_data_rds" + "sd:localLabel": true atac_fragments_file: type: File @@ -72,16 +71,16 @@ inputs: Ranger Count (RNA+ATAC)", "Cell Ranger Aggregate (RNA+ATAC)", "Cell Ranger Count (ATAC)", or "Cell Ranger Aggregate (ATAC)". - 'sd:upstreamSource': "sc_atac_sample/atac_fragments_file" - 'sd:localLabel': true + "sd:upstreamSource": "sc_atac_sample/atac_fragments_file" + "sd:localLabel": true genome_type: type: string label: "Genome" doc: | Reference genome - 'sd:upstreamSource': "genome_indices/genome" - 'sd:localLabel': true + "sd:upstreamSource": "genome_indices/genome" + "sd:localLabel": true datasets_metadata: type: File? 
@@ -183,7 +182,7 @@ inputs: label: "Maximum adjusted P-value to show in IGV" doc: | In the exploratory visualization part of the analysis - output only differentially bound peaks with adjusted + output only differentially accessible regions with adjusted P-value not bigger than this value. Default: 0.05 minimum_logfc: @@ -192,7 +191,7 @@ inputs: label: "Maximum log2 Fold Change value to show in IGV" doc: | In the exploratory visualization part of the analysis - output only differentially bound peaks with log2 Fold + output only differentially accessible regions with log2 Fold Change not smaller than this value. Default: 1.0 blacklist_regions_file: @@ -213,7 +212,7 @@ inputs: doc: | Minimum FDR (q-value) cutoff for MACS2 peak detection. Ignored if --test is not set to manorm2. Default: 0.05 - 'sd:layout': + "sd:layout": advanced: true minimum_peak_gap: @@ -225,7 +224,7 @@ inputs: provided value they will be merged before splitting them into reference genomic bins of size --binsize. Ignored if --test is not set to manorm2. Default: 150 - 'sd:layout': + "sd:layout": advanced: true bin_size: @@ -237,7 +236,7 @@ inputs: used by MAnorm2 when generating a table of reads counts per peaks. Ignored if --test is not set to manorm2. Default: 1000 - 'sd:layout': + "sd:layout": advanced: true maximum_peaks: @@ -249,7 +248,7 @@ inputs: qvalue) peaks to keep from each group of cells when constructing reference genomic bins. Ignored if --test is not set to manorm2. Default: keep all peaks - 'sd:layout': + "sd:layout": advanced: true threads: @@ -263,14 +262,14 @@ inputs: - "4" - "5" - "6" - default: "1" + default: "6" label: "Number of cores/cpus to use" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 1 - 'sd:layout': + Default: 6 + "sd:layout": advanced: true @@ -285,10 +284,10 @@ outputs: optionally subsetted to the specific group (rnaumap dim. reduction). 
PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Overall' - Caption: 'Cells RNA UMAP split by selected criteria' + tab: "Overall" + Caption: "Cells RNA UMAP split by selected criteria" umap_rd_atacumap_plot_png: type: File? @@ -299,10 +298,10 @@ outputs: optionally subsetted to the specific group (atacumap dim. reduction). PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Overall' - Caption: 'Cells ATAC UMAP split by selected criteria' + tab: "Overall" + Caption: "Cells ATAC UMAP split by selected criteria" umap_rd_wnnumap_plot_png: type: File? @@ -313,22 +312,22 @@ outputs: optionally subsetted to the specific group (atacumap dim. reduction). PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Overall' - Caption: 'Cells WNN UMAP split by selected criteria' + tab: "Overall" + Caption: "Cells WNN UMAP split by selected criteria" dbnd_vlcn_plot_png: type: File? outputSource: sc_atac_dbinding/dbnd_vlcn_plot_png - label: "Volcano plot of differentially bound sites" + label: "Volcano plot of differentially accessible regions" doc: | - Volcano plot of differentially bound sites. + Volcano plot of differentially accessible regions. PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Overall' - Caption: 'Volcano plot of differentially bound sites' + tab: "Overall" + Caption: "Volcano plot of differentially accessible regions" seurat_peaks_bigbed_file: type: File @@ -338,12 +337,12 @@ outputs: Peaks in bigBed format extracted from the loaded from provided RDS file Seurat object. - 'sd:visualPlugins': + "sd:visualPlugins": - igvbrowser: - tab: 'Genome Browser' - id: 'igvbrowser' - type: 'annotation' - format: 'bigbed' + tab: "Genome Browser" + id: "igvbrowser" + type: "annotation" + format: "bigbed" name: "Seurat peaks" height: 40 @@ -356,11 +355,11 @@ outputs: for ATAC fragments from the cells that belong to the group defined by the --first and --groupby parameters. 
- 'sd:visualPlugins': + "sd:visualPlugins": - igvbrowser: - tab: 'Genome Browser' - id: 'igvbrowser' - type: 'wig' + tab: "Genome Browser" + id: "igvbrowser" + type: "wig" name: "ATAC fragments coverage (first)" height: 120 @@ -373,11 +372,11 @@ outputs: for ATAC fragments from the cells that belong to the group defined by the --second and --groupby parameters. - 'sd:visualPlugins': + "sd:visualPlugins": - igvbrowser: - tab: 'Genome Browser' - id: 'igvbrowser' - type: 'wig' + tab: "Genome Browser" + id: "igvbrowser" + type: "wig" name: "ATAC fragments coverage (second)" height: 120 @@ -390,11 +389,11 @@ outputs: for Tn5 cut sites from the cells that belong to the group defined by the --first and --groupby parameters. - 'sd:visualPlugins': + "sd:visualPlugins": - igvbrowser: - tab: 'Genome Browser' - id: 'igvbrowser' - type: 'wig' + tab: "Genome Browser" + id: "igvbrowser" + type: "wig" name: "Tn5 coverage (first)" height: 120 @@ -407,11 +406,11 @@ outputs: for Tn5 cut sites from the cells that belong to the group defined by the --second and --groupby parameters. - 'sd:visualPlugins': + "sd:visualPlugins": - igvbrowser: - tab: 'Genome Browser' - id: 'igvbrowser' - type: 'wig' + tab: "Genome Browser" + id: "igvbrowser" + type: "wig" name: "Tn5 coverage (second)" height: 120 @@ -444,11 +443,11 @@ outputs: from the Tn5 cut sites of the cells that belong to the group defined by the --first and --groupby parameters. - 'sd:visualPlugins': + "sd:visualPlugins": - igvbrowser: - tab: 'Genome Browser' - id: 'igvbrowser' - type: 'annotation' + tab: "Genome Browser" + id: "igvbrowser" + type: "annotation" name: "Called peaks (first)" displayMode: "COLLAPSE" height: 40 @@ -462,11 +461,11 @@ outputs: from the Tn5 cut sites of the cells that belong to the group defined by the --second and --groupby parameters. 
- 'sd:visualPlugins': + "sd:visualPlugins": - igvbrowser: - tab: 'Genome Browser' - id: 'igvbrowser' - type: 'annotation' + tab: "Genome Browser" + id: "igvbrowser" + type: "annotation" name: "Called peaks (second)" displayMode: "COLLAPSE" height: 40 @@ -494,63 +493,63 @@ outputs: diff_bound_sites: type: File outputSource: sc_atac_dbinding/diff_bound_sites - label: "Differentially bound sites" + label: "Differentially accessible regions" doc: | - Not filtered differentially bound sites + Not filtered differentially accessible regions in TSV format - 'sd:visualPlugins': + "sd:visualPlugins": - syncfusiongrid: - tab: 'Diff bound sites' - Title: 'Differentially bound sites. Not filtered' + tab: "Diff. accessible regions" + Title: "Differentially accessible regions. Not filtered" diff_bound_sites_with_labels: type: File outputSource: add_label_column/output_file - label: "Differentially bound sites with labels" + label: "Differentially accessible regions with labels" doc: | - Not filtered differentially bound sites + Not filtered differentially accessible regions with labels in TSV format first_enrch_bigbed_file: type: File? outputSource: sc_atac_dbinding/first_enrch_bigbed_file - label: "Significant differentially bound sites (first)" + label: "Significant differentially accessible regions (first)" doc: | Peaks in bigBed format filtered by --padj and --logfc thresholds enriched in the group of cells defined by the --first and --groupby parameters. - 'sd:visualPlugins': + "sd:visualPlugins": - igvbrowser: - tab: 'Genome Browser' - id: 'igvbrowser' - type: 'annotation' - format: 'bigbed' - name: "Diff. bound sites (first)" + tab: "Genome Browser" + id: "igvbrowser" + type: "annotation" + format: "bigbed" + name: "Diff. accessible regions (first)" height: 40 second_enrch_bigbed_file: type: File? 
outputSource: sc_atac_dbinding/second_enrch_bigbed_file - label: "Significant differentially bound sites (second)" + label: "Significant differentially accessible regions (second)" doc: | Peaks in bigBed format filtered by --padj and --logfc thresholds enriched in the group of cells defined by the --second and --groupby parameters. - 'sd:visualPlugins': + "sd:visualPlugins": - igvbrowser: - tab: 'Genome Browser' - id: 'igvbrowser' - type: 'annotation' - format: 'bigbed' - name: "Diff. bound sites (second)" + tab: "Genome Browser" + id: "igvbrowser" + type: "annotation" + format: "bigbed" + name: "Diff. accessible regions (second)" height: 40 first_enrch_bed_file: type: File? outputSource: sc_atac_dbinding/first_enrch_bed_file - label: "Significant differentially bound sites (first)" + label: "Significant differentially accessible regions (first)" doc: | Peaks in BED format filtered by --padj and --logfc thresholds enriched @@ -560,7 +559,7 @@ outputs: second_enrch_bed_file: type: File? outputSource: sc_atac_dbinding/second_enrch_bed_file - label: "Significant differentially bound sites (second)" + label: "Significant differentially accessible regions (second)" doc: | Peaks in BED format filtered by --padj and --logfc thresholds enriched @@ -573,9 +572,9 @@ outputs: label: "Volcano Plot" doc: | HTML index file for Volcano Plot - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" volcano_plot_html_data: @@ -601,34 +600,33 @@ outputs: label: "Tag density heatmap" doc: | Tag density heatmap around centers - of differentially bound sites in + of differentially accessible regions in PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Overall' - Caption: 'Tag density heatmap around centers of diff. bound sites' + tab: "Overall" + Caption: "Tag density heatmap around centers of diff. 
accessible regions" pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder - label: "Plots in PDF format" + label: "Compressed folder with all PDF plots" doc: | - Compressed folder with plots - in PDF format + Compressed folder with all PDF plots. sc_atac_dbinding_stdout_log: type: File outputSource: sc_atac_dbinding/stdout_log - label: "stdout log generated by sc_atac_dbinding step" + label: "Output log" doc: | - stdout log generated by sc_atac_dbinding step + Stdout log from the sc_atac_dbinding step. sc_atac_dbinding_stderr_log: type: File outputSource: sc_atac_dbinding/stderr_log - label: "stderr log generated by sc_atac_dbinding step" + label: "Error log" doc: | - stderr log generated by sc_atac_dbinding step + Stderr log from the sc_atac_dbinding step. steps: @@ -929,9 +927,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-Cell ATAC-Seq Differential Binding Analysis" -s:name: "Single-Cell ATAC-Seq Differential Binding Analysis" -s:alternateName: "Identifies differentially bound sites between any two groups of cells" +label: "Single-Cell ATAC-Seq Differential Accessibility Analysis" +s:name: "Single-Cell ATAC-Seq Differential Accessibility Analysis" +s:alternateName: "Identifies differentially accessible regions between two groups of cells" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-atac-dbinding.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -969,8 +967,8 @@ s:creator: doc: | - Single-Cell ATAC-Seq Differential Binding Analysis + Single-Cell ATAC-Seq Differential Accessibility Analysis - Identifies differentially bound sites between any two - groups of cells, optionally aggregating chromatin + Identifies differentially accessible regions between any + two groups of cells, optionally aggregating chromatin accessibility data from single-cell to pseudobulk form. 
\ No newline at end of file diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index fe8ca1d8..a506a983 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -16,7 +16,7 @@ requirements: return (splitted_line && !!splitted_line.length)?splitted_line:null; }; -'sd:upstream': +"sd:upstream": sc_tools_sample: - "sc-rna-cluster.cwl" - "sc-rna-reduce.cwl" @@ -39,8 +39,8 @@ inputs: multiome ATAC and RNA-Seq or just ATAC-Seq datasets filtered by QC metrics to include only high-quality cells. - 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" - 'sd:localLabel': true + "sd:upstreamSource": "sc_tools_sample/seurat_data_rds" + "sd:localLabel": true normalization_method: type: @@ -168,7 +168,7 @@ inputs: used for datasets integration, scaling and dimensionality reduction. Default: 0 (use all available peaks) - 'sd:layout': + "sd:layout": advanced: true export_ucsc_cb: @@ -178,7 +178,7 @@ inputs: doc: | Export results into UCSC Cell Browser Default: false - 'sd:layout': + "sd:layout": advanced: true color_theme: @@ -214,13 +214,13 @@ inputs: - "4" - "5" - "6" - default: "1" + default: "6" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 1 + Default: 6 "sd:layout": advanced: true @@ -234,10 +234,10 @@ outputs: doc: | Correlation between QC metrics and LSI components - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'QC' - Caption: 'Correlation between QC metrics and LSI components' + tab: "QC" + Caption: "Correlation between QC metrics and LSI components" umap_qc_mtrcs_plot_png: type: File? @@ -245,10 +245,10 @@ outputs: label: "UMAP, QC metrics" doc: | UMAP, QC metrics - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'QC' - Caption: 'UMAP, QC metrics' + tab: "QC" + Caption: "UMAP, QC metrics" umap_plot_png: type: File? 
@@ -256,10 +256,10 @@ outputs: label: "UMAP, colored by dataset" doc: | UMAP, colored by dataset - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by dataset' + tab: "Per dataset" + Caption: "UMAP, colored by dataset" umap_spl_idnt_plot_png: type: File? @@ -267,10 +267,10 @@ outputs: label: "UMAP, split by dataset" doc: | UMAP, split by dataset - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, split by dataset' + tab: "Per dataset" + Caption: "UMAP, split by dataset" umap_spl_frgm_plot_png: type: File? @@ -279,10 +279,10 @@ outputs: doc: | UMAP, colored by dataset, split by ATAC fragments in peaks per cell. - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by dataset, split by ATAC fragments in peaks per cell' + tab: "Per dataset" + Caption: "UMAP, colored by dataset, split by ATAC fragments in peaks per cell" umap_spl_peak_plot_png: type: File? @@ -291,10 +291,10 @@ outputs: doc: | UMAP, colored by dataset, split by peaks per cell - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by dataset, split by peaks per cell' + tab: "Per dataset" + Caption: "UMAP, colored by dataset, split by peaks per cell" umap_spl_tss_plot_png: type: File? @@ -303,10 +303,10 @@ outputs: doc: | UMAP, colored by dataset, split by TSS enrichment score - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by dataset, split by TSS enrichment score' + tab: "Per dataset" + Caption: "UMAP, colored by dataset, split by TSS enrichment score" umap_spl_ncls_plot_png: type: File? 
@@ -315,10 +315,10 @@ outputs: doc: | UMAP, colored by dataset, split by nucleosome signal - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by dataset, split by nucleosome signal' + tab: "Per dataset" + Caption: "UMAP, colored by dataset, split by nucleosome signal" umap_spl_frip_plot_png: type: File? @@ -327,10 +327,10 @@ outputs: doc: | UMAP, colored by dataset, split by FRiP - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by dataset, split by FRiP' + tab: "Per dataset" + Caption: "UMAP, colored by dataset, split by FRiP" umap_spl_blck_plot_png: type: File? @@ -339,10 +339,10 @@ outputs: doc: | UMAP, colored by dataset, split by blacklist fraction - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by dataset, split by blacklist fraction' + tab: "Per dataset" + Caption: "UMAP, colored by dataset, split by blacklist fraction" umap_spl_cnd_plot_png: type: File? @@ -351,10 +351,10 @@ outputs: doc: | UMAP, colored by dataset, split by grouping condition - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by dataset, split by grouping condition' + tab: "Per group" + Caption: "UMAP, colored by dataset, split by grouping condition" umap_gr_cnd_spl_frgm_plot_png: type: File? @@ -363,10 +363,10 @@ outputs: doc: | UMAP, colored by grouping condition, split by ATAC fragments in peaks per cell - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by grouping condition, split by ATAC fragments in peaks per cell' + tab: "Per group" + Caption: "UMAP, colored by grouping condition, split by ATAC fragments in peaks per cell" umap_gr_cnd_spl_peak_plot_png: type: File? 
@@ -375,10 +375,10 @@ outputs: doc: | UMAP, colored by grouping condition, split by peaks per cell - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by grouping condition, split by peaks per cell' + tab: "Per group" + Caption: "UMAP, colored by grouping condition, split by peaks per cell" umap_gr_cnd_spl_tss_plot_png: type: File? @@ -387,10 +387,10 @@ outputs: doc: | UMAP, colored by grouping condition, split by TSS enrichment score - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by grouping condition, split by TSS enrichment score' + tab: "Per group" + Caption: "UMAP, colored by grouping condition, split by TSS enrichment score" umap_gr_cnd_spl_ncls_plot_png: type: File? @@ -399,10 +399,10 @@ outputs: doc: | UMAP, colored by grouping condition, split by nucleosome signal - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by grouping condition, split by nucleosome signal' + tab: "Per group" + Caption: "UMAP, colored by grouping condition, split by nucleosome signal" umap_gr_cnd_spl_frip_plot_png: type: File? @@ -411,10 +411,10 @@ outputs: doc: | UMAP, colored by grouping condition, split by FRiP - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by grouping condition, split by FRiP' + tab: "Per group" + Caption: "UMAP, colored by grouping condition, split by FRiP" umap_gr_cnd_spl_blck_plot_png: type: File? @@ -423,25 +423,24 @@ outputs: doc: | UMAP, colored by grouping condition, split by blacklist fraction - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by grouping condition, split by blacklist fraction' + tab: "Per group" + Caption: "UMAP, colored by grouping condition, split by blacklist fraction" ucsc_cb_html_data: type: Directory? 
outputSource: sc_atac_reduce/ucsc_cb_html_data - label: "UCSC Cell Browser data" + label: "UCSC Cell Browser (data)" doc: | - Directory with UCSC Cell Browser - data + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? outputSource: sc_atac_reduce/ucsc_cb_html_file label: "UCSC Cell Browser" doc: | - UCSC Cell Browser HTML index file + UCSC Cell Browser html index. "sd:visualPlugins": - linkList: tab: "Overview" @@ -450,31 +449,31 @@ outputs: seurat_data_rds: type: File outputSource: sc_atac_reduce/seurat_data_rds - label: "Processed Seurat data in RDS format" + label: "Seurat object in RDS format" doc: | - Processed Seurat data in RDS format + Seurat object. + RDS format. pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder - label: "Plots in PDF format" + label: "Compressed folder with all PDF plots" doc: | - Compressed folder with plots - in PDF format + Compressed folder with all PDF plots. sc_atac_reduce_stdout_log: type: File outputSource: sc_atac_reduce/stdout_log - label: "stdout log generated by sc_atac_reduce step" + label: "Output log" doc: | - stdout log generated by sc_atac_reduce step + Stdout log from the sc_atac_reduce step. sc_atac_reduce_stderr_log: type: File outputSource: sc_atac_reduce/stderr_log - label: "stderr log generated by sc_atac_reduce step" + label: "Error log" doc: | - stderr log generated by sc_atac_reduce step + Stderr log from the sc_atac_reduce step. steps: diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 88095fe1..1ceeff57 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -26,7 +26,7 @@ requirements: }; -'sd:upstream': +"sd:upstream": sc_tools_sample: - "sc-rna-cluster.cwl" - "sc-atac-cluster.cwl" @@ -57,8 +57,8 @@ inputs: "Single-Cell ATAC-Seq Cluster Analysis", "Single-Cell WNN Cluster Analysis", - at any of the processing stages. 
- 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" - 'sd:localLabel': true + "sd:upstreamSource": "sc_tools_sample/seurat_data_rds" + "sd:localLabel": true atac_fragments_file: type: File? @@ -74,8 +74,8 @@ inputs: (RNA+ATAC)", "Cell Ranger Aggregate (RNA+ATAC)", "Cell Ranger Count (ATAC)", or "Cell Ranger Aggregate (ATAC)". - 'sd:upstreamSource': "sc_atac_sample/atac_fragments_file" - 'sd:localLabel': true + "sd:upstreamSource": "sc_atac_sample/atac_fragments_file" + "sd:localLabel": true query_reduction: type: @@ -209,314 +209,272 @@ inputs: - "4" - "5" - "6" - default: "1" + default: "6" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 1 + Default: 6 "sd:layout": advanced: true outputs: - umap_rd_rnaumap_plot_png: + umap_gr_ctyp_plot_png: type: File? - outputSource: ctype_assign/umap_rd_rnaumap_plot_png - label: "UMAP, colored by cell type, RNA" + outputSource: ctype_assign/umap_gr_ctyp_plot_png + label: "UMAP colored by cell type (all cells)" doc: | - UMAP, colored by cell type, RNA - 'sd:visualPlugins': - - image: - tab: 'Per cell type' - Caption: 'UMAP, colored by cell type, RNA' - - umap_rd_atacumap_plot_png: - type: File? - outputSource: ctype_assign/umap_rd_atacumap_plot_png - label: "UMAP, colored by cell type, ATAC" - doc: | - UMAP, colored by cell type, ATAC - 'sd:visualPlugins': - - image: - tab: 'Per cell type' - Caption: 'UMAP, colored by cell type, ATAC' - - umap_rd_wnnumap_plot_png: - type: File? - outputSource: ctype_assign/umap_rd_wnnumap_plot_png - label: "UMAP, colored by cell type, WNN" - doc: | - UMAP, colored by cell type, WNN - 'sd:visualPlugins': - - image: - tab: 'Per cell type' - Caption: 'UMAP, colored by cell type, WNN' - - umap_spl_ph_rd_rnaumap_plot_png: - type: File? 
- outputSource: ctype_assign/umap_spl_ph_rd_rnaumap_plot_png - label: "UMAP, colored by cell type, split by cell cycle phase, RNA" - doc: | - UMAP, colored by cell type, split - by cell cycle phase, RNA - 'sd:visualPlugins': - - image: - tab: 'Per cell type' - Caption: 'UMAP, colored by cell type, split by cell cycle phase, RNA' - - umap_spl_ph_rd_atacumap_plot_png: - type: File? - outputSource: ctype_assign/umap_spl_ph_rd_atacumap_plot_png - label: "UMAP, colored by cell type, split by cell cycle phase, ATAC" - doc: | - UMAP, colored by cell type, split - by cell cycle phase, ATAC - 'sd:visualPlugins': - - image: - tab: 'Per cell type' - Caption: 'UMAP, colored by cell type, split by cell cycle phase, ATAC' - - umap_spl_ph_rd_wnnumap_plot_png: - type: File? - outputSource: ctype_assign/umap_spl_ph_rd_wnnumap_plot_png - label: "UMAP, colored by cell type, split by cell cycle phase, WNN" - doc: | - UMAP, colored by cell type, split - by cell cycle phase, WNN - 'sd:visualPlugins': - - image: - tab: 'Per cell type' - Caption: 'UMAP, colored by cell type, split by cell cycle phase, WNN' - - cmp_gr_ph_spl_ctyp_plot_png: - type: File? - outputSource: ctype_assign/cmp_gr_ph_spl_ctyp_plot_png - label: "Composition plot, colored by cell cycle phase, split by cell type, downsampled" - doc: | - Composition plot, colored by cell - cycle phase, split by cell type, - downsampled - 'sd:visualPlugins': - - image: - tab: 'Per dataset' - Caption: 'Composition plot, colored by cell cycle phase, split by cell type, downsampled' - - umap_spl_idnt_rd_rnaumap_plot_png: - type: File? - outputSource: ctype_assign/umap_spl_idnt_rd_rnaumap_plot_png - label: "UMAP, colored by cell type, split by dataset, RNA" - doc: | - UMAP, colored by cell type, - split by dataset, RNA - 'sd:visualPlugins': - - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by cell type, split by dataset, RNA' - - umap_spl_idnt_rd_atacumap_plot_png: - type: File? 
- outputSource: ctype_assign/umap_spl_idnt_rd_atacumap_plot_png - label: "UMAP, colored by cell type, split by dataset, ATAC" - doc: | - UMAP, colored by cell type, - split by dataset, ATAC - 'sd:visualPlugins': + UMAP colored by cell type. + All cells. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by cell type, split by dataset, ATAC' + tab: "Per cell type" + Caption: "UMAP colored by cell type (all cells)" - umap_spl_idnt_rd_wnnumap_plot_png: + umap_gr_ctyp_spl_idnt_plot_png: type: File? - outputSource: ctype_assign/umap_spl_idnt_rd_wnnumap_plot_png - label: "UMAP, colored by cell type, split by dataset, WNN" + outputSource: ctype_assign/umap_gr_ctyp_spl_idnt_plot_png + label: "UMAP colored by cell type (split by dataset, downsampled)" doc: | - UMAP, colored by cell type, - split by dataset, WNN - 'sd:visualPlugins': + UMAP colored by cell type. + Split by dataset; downsampled to the + smallest dataset. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by cell type, split by dataset, WNN' + tab: "Per dataset" + Caption: "UMAP colored by cell type (split by dataset, downsampled)" cmp_gr_ctyp_spl_idnt_plot_png: type: File? outputSource: ctype_assign/cmp_gr_ctyp_spl_idnt_plot_png - label: "Composition plot, colored by cell type, split by dataset, downsampled" + label: "Composition plot colored by cell type (split by dataset, downsampled)" doc: | - Composition plot, colored by cell - type, split by dataset, downsampled - 'sd:visualPlugins': + Composition plot colored by cell type. + Split by dataset; downsampled to the + smallest dataset. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Composition plot, colored by cell type, split by dataset, downsampled' + tab: "Per dataset" + Caption: "Composition plot colored by cell type (split by dataset, downsampled)" cmp_gr_idnt_spl_ctyp_plot_png: type: File? 
outputSource: ctype_assign/cmp_gr_idnt_spl_ctyp_plot_png - label: "Composition plot, colored by dataset, split by cell type, downsampled" + label: "Composition plot colored by dataset (split by cell type, downsampled)" + doc: | + Composition plot colored by dataset. + Split by cell type; downsampled to + the smallest dataset. + PNG format. + "sd:visualPlugins": + - image: + tab: "Per dataset" + Caption: "Composition plot colored by dataset (split by cell type, downsampled)" + + umap_gr_ph_spl_idnt_plot_png: + type: File? + outputSource: ctype_assign/umap_gr_ph_spl_idnt_plot_png + label: "UMAP colored by cell cycle phase (split by dataset, downsampled)" doc: | - Composition plot, colored by - dataset, split by cell type, - downsampled - 'sd:visualPlugins': + UMAP colored by cell cycle phase. + Split by dataset; downsampled to the + smallest dataset. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Composition plot, colored by dataset, split by cell type, downsampled' + tab: "Per dataset" + Caption: "UMAP colored by cell cycle phase (split by dataset, downsampled)" cmp_gr_ph_spl_idnt_plot_png: type: File? outputSource: ctype_assign/cmp_gr_ph_spl_idnt_plot_png - label: "Composition plot, colored by cell cycle phase, split by dataset, downsampled" + label: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" doc: | - Composition plot, colored by - cell cycle phase, split by - dataset, downsampled - 'sd:visualPlugins': + Composition plot colored by cell cycle phase. + Split by dataset; downsampled to the smallest + dataset. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Composition plot, colored by cell cycle phase, split by dataset, downsampled' + tab: "Per dataset" + Caption: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" - umap_spl_cnd_rd_rnaumap_plot_png: + umap_gr_ctyp_spl_ph_png: type: File? 
- outputSource: ctype_assign/umap_spl_cnd_rd_rnaumap_plot_png - label: "UMAP, colored by cell type, split by grouping condition, RNA" - doc: | - UMAP, colored by cell type, split - by grouping condition, RNA - 'sd:visualPlugins': + outputSource: ctype_assign/umap_gr_ctyp_spl_ph_png + label: "UMAP colored by cell type (split by cell cycle phase, optionally downsampled)" + doc: | + UMAP colored by cell type. + Split by cell cycle phase; downsampled + to the smallest dataset (if multiple + datasets are analyzed jointly). + PNG format. + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by cell type, split by grouping condition, RNA' + tab: "Per cell type" + Caption: "UMAP colored by cell type (split by cell cycle phase, optionally downsampled)" - umap_spl_cnd_rd_atacumap_plot_png: + cmp_gr_ph_spl_ctyp_png: type: File? - outputSource: ctype_assign/umap_spl_cnd_rd_atacumap_plot_png - label: "UMAP, colored by cell type, split by grouping condition, ATAC" - doc: | - UMAP, colored by cell type, split - by grouping condition, ATAC - 'sd:visualPlugins': + outputSource: ctype_assign/cmp_gr_ph_spl_ctyp_png + label: "Composition plot colored by cell cycle phase (split by cell type, optionally downsampled)" + doc: | + Composition plot colored by cell cycle phase. + Split by cell type; downsampled to the + smallest dataset (if multiple datasets are + analyzed jointly). + PNG format. + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by cell type, split by grouping condition, ATAC' + tab: "Per cell type" + Caption: "Composition plot colored by cell cycle phase (split by cell type, optionally downsampled)" - umap_spl_cnd_rd_wnnumap_plot_png: + umap_gr_ctyp_spl_cnd_plot_png: type: File? 
- outputSource: ctype_assign/umap_spl_cnd_rd_wnnumap_plot_png - label: "UMAP, colored by cell type, split by grouping condition, WNN" - doc: | - UMAP, colored by cell type, split - by grouping condition, WNN - 'sd:visualPlugins': + outputSource: ctype_assign/umap_gr_ctyp_spl_cnd_plot_png + label: "UMAP colored by cell type (split by grouping condition, downsampled)" + doc: | + UMAP colored by cell type. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by cell type, split by grouping condition, WNN' + tab: "Per group" + Caption: "UMAP colored by cell type (split by grouping condition, downsampled)" cmp_gr_ctyp_spl_cnd_plot_png: type: File? outputSource: ctype_assign/cmp_gr_ctyp_spl_cnd_plot_png - label: "Composition plot, colored by cell type, split by grouping condition, downsampled" + label: "Composition plot colored by cell type (split by grouping condition, downsampled)" doc: | - Composition plot, colored by cell - type, split by grouping condition, - downsampled - 'sd:visualPlugins': + Composition plot colored by cell type. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'Composition plot, colored by cell type, split by grouping condition, downsampled' + tab: "Per group" + Caption: "Composition plot colored by cell type (split by grouping condition, downsampled)" cmp_gr_cnd_spl_ctyp_plot_png: type: File? 
outputSource: ctype_assign/cmp_gr_cnd_spl_ctyp_plot_png - label: "Composition plot, colored by grouping condition, split by cell type, downsampled" + label: "Composition plot colored by grouping condition (split by cell type, downsampled)" doc: | - Composition plot, colored by - grouping condition, split by - cell type, downsampled - 'sd:visualPlugins': + Composition plot colored by grouping condition. + Split by cell type; first downsampled to the + smallest dataset, then downsampled to the + smallest group. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'Composition plot, colored by grouping condition, split by cell type, downsampled' + tab: "Per group" + Caption: "Composition plot colored by grouping condition (split by cell type, downsampled)" - xpr_avg_plot_png: + umap_gr_ph_spl_cnd_plot_png: type: File? - outputSource: ctype_assign/xpr_avg_plot_png - label: "Gene expression dot plot" - doc: | - Gene expression dot plot - 'sd:visualPlugins': + outputSource: ctype_assign/umap_gr_ph_spl_cnd_plot_png + label: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" + doc: | + UMAP colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + "sd:visualPlugins": - image: - tab: 'Genes of interest' - Caption: 'Gene expression dot plot' + tab: "Per group" + Caption: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" - xpr_dnst_plot_png: - type: - - "null" - - type: array - items: File - outputSource: ctype_assign/xpr_dnst_plot_png - label: "Gene expression violin plot" - doc: | - Gene expression violin plot - 'sd:visualPlugins': + cmp_gr_ph_spl_cnd_plot_png: + type: File? + outputSource: ctype_assign/cmp_gr_ph_spl_cnd_plot_png + label: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" + doc: | + Composition plot colored by cell cycle phase. 
+ Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + "sd:visualPlugins": - image: - tab: 'Genes of interest' - Caption: 'Gene expression violin plot' + tab: "Per group" + Caption: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" - xpr_per_cell_rd_rnaumap_plot_png: - type: - - "null" - - type: array - items: File - outputSource: ctype_assign/xpr_per_cell_rd_rnaumap_plot_png - label: "UMAP, gene expression, RNA" + xpr_avg_plot_png: + type: File? + outputSource: ctype_assign/xpr_avg_plot_png + label: "Average gene expression" doc: | - UMAP, gene expression, RNA - 'sd:visualPlugins': + Average gene expression. + PNG format. + "sd:visualPlugins": - image: - tab: 'Genes of interest' - Caption: 'UMAP, gene expression, RNA' + tab: "Gene expression" + Caption: "Average gene expression" - xpr_per_cell_rd_atacumap_plot_png: + xpr_per_cell_plot_png: type: - "null" - type: array items: File - outputSource: ctype_assign/xpr_per_cell_rd_atacumap_plot_png - label: "UMAP, gene expression, ATAC" + outputSource: ctype_assign/xpr_per_cell_plot_png + label: "UMAP colored by gene expression (per gene)" doc: | - UMAP, gene expression, ATAC - 'sd:visualPlugins': + UMAP colored by gene expression. + All genes of interest. + PNG format. + "sd:visualPlugins": - image: - tab: 'Genes of interest' - Caption: 'UMAP, gene expression, ATAC' + tab: "Gene expression" + Caption: "UMAP colored by gene expression (per gene)" - xpr_per_cell_rd_wnnumap_plot_png: + xpr_dnst_plot_png: type: - "null" - type: array items: File - outputSource: ctype_assign/xpr_per_cell_rd_wnnumap_plot_png - label: "UMAP, gene expression, WNN" + outputSource: ctype_assign/xpr_dnst_plot_png + label: "Gene expression density (per gene)" doc: | - UMAP, gene expression, WNN - 'sd:visualPlugins': + Gene expression density. + All genes of interest. + PNG format. 
+ "sd:visualPlugins": - image: - tab: 'Genes of interest' - Caption: 'UMAP, gene expression, WNN' + tab: "Gene expression" + Caption: "Gene expression density (per gene)" xpr_htmp_plot_png: type: File? outputSource: ctype_assign/xpr_htmp_plot_png - label: "Gene expression heatmap" + label: "Gene expression heatmap (top gene markers)" doc: | - Gene expression heatmap - 'sd:visualPlugins': + Gene expression heatmap. + Top gene markers. + PNG format. + "sd:visualPlugins": - image: - tab: 'Heatmap' - Caption: 'Gene expression heatmap' + tab: "Gene expression heatmap" + Caption: "Gene expression heatmap (top gene markers)" + + xpr_htmp_tsv: + type: File? + outputSource: ctype_assign/xpr_htmp_tsv + label: "Gene expression heatmap (top gene markers)" + doc: | + Gene expression heatmap. + Top gene markers. + TSV format. cvrg_plot_png: type: @@ -524,58 +482,53 @@ outputs: - type: array items: File outputSource: ctype_assign/cvrg_plot_png - label: "ATAC fragments coverage" + label: "ATAC fragment coverage (per gene)" doc: | - ATAC fragments coverage - 'sd:visualPlugins': + ATAC fragment coverage. + All genes of interest. + PNG format. + "sd:visualPlugins": - image: - tab: 'Genome coverage' - Caption: 'ATAC fragments coverage' - - xpr_htmp_tsv: - type: File? - outputSource: ctype_assign/xpr_htmp_tsv - label: "Markers from gene expression heatmap" - doc: | - Gene markers used for gene - expression heatmap + tab: "Genome coverage" + Caption: "ATAC fragment coverage (per gene)" gene_markers_tsv: type: File? outputSource: ctype_assign/gene_markers_tsv - label: "Gene markers per cell type" + label: "Gene markers" doc: | - Gene markers per cell type - 'sd:visualPlugins': + Gene markers. + TSV format. + "sd:visualPlugins": - syncfusiongrid: - tab: 'Gene markers' - Title: 'Gene markers per cell type' + tab: "Gene markers" + Title: "Gene markers" peak_markers_tsv: type: File? 
outputSource: ctype_assign/peak_markers_tsv - label: "Peak markers per cell type" + label: "Peak markers" doc: | - Peak markers per cell type - 'sd:visualPlugins': + Peak markers. + TSV format. + "sd:visualPlugins": - syncfusiongrid: - tab: 'Peak markers' - Title: 'Peak markers per cell type' + tab: "Peak markers" + Title: "Peak markers" ucsc_cb_html_data: type: Directory? outputSource: ctype_assign/ucsc_cb_html_data - label: "UCSC Cell Browser data" + label: "UCSC Cell Browser (data)" doc: | - Directory with UCSC Cell Browser - data + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? outputSource: ctype_assign/ucsc_cb_html_file label: "UCSC Cell Browser" doc: | - UCSC Cell Browser HTML index file + UCSC Cell Browser html index. "sd:visualPlugins": - linkList: tab: "Overview" @@ -584,38 +537,40 @@ outputs: seurat_data_rds: type: File outputSource: ctype_assign/seurat_data_rds - label: "Processed Seurat data in RDS format" + label: "Seurat object in RDS format" doc: | - Processed Seurat data in RDS format + Seurat object. + RDS format. seurat_data_scope: type: File? outputSource: ctype_assign/seurat_data_scope - label: "Processed Seurat data in SCope compatible loom format" + label: "Seurat object in SCope compatible loom format" doc: | - Processed Seurat data in SCope compatible loom format + Seurat object. + SCope compatible. + Loom format. pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder - label: "Plots in PDF format" + label: "Compressed folder with all PDF plots" doc: | - Compressed folder with plots - in PDF format + Compressed folder with all PDF plots. ctype_assign_stdout_log: type: File outputSource: ctype_assign/stdout_log - label: "stdout log generated by ctype_assign step" + label: "Output log" doc: | - stdout log generated by ctype_assign step + Stdout log from the ctype_assign step. 
ctype_assign_stderr_log: type: File outputSource: ctype_assign/stderr_log - label: "stderr log generated by ctype_assign step" + label: "Error log" doc: | - stderr log generated by ctype_assign step + Stderr log from the ctype_assign step. steps: @@ -643,6 +598,18 @@ steps: return null; } } + reduction: + source: query_reduction + valueFrom: | + ${ + if (self == "RNA") { + return "rnaumap"; + } else if (self == "ATAC") { + return "atacumap"; + } else { + return "wnnumap"; + } + } atac_fragments_file: atac_fragments_file genes_of_interest: source: genes_of_interest @@ -680,59 +647,43 @@ steps: source: threads valueFrom: $(parseInt(self)) out: - - umap_rd_rnaumap_plot_png - - umap_rd_atacumap_plot_png - - umap_rd_wnnumap_plot_png - - umap_spl_idnt_rd_rnaumap_plot_png - - umap_spl_idnt_rd_atacumap_plot_png - - umap_spl_idnt_rd_wnnumap_plot_png - - umap_spl_cnd_rd_rnaumap_plot_png - - umap_spl_cnd_rd_atacumap_plot_png - - umap_spl_cnd_rd_wnnumap_plot_png - - umap_spl_ph_rd_rnaumap_plot_png - - umap_spl_ph_rd_atacumap_plot_png - - umap_spl_ph_rd_wnnumap_plot_png + - umap_gr_ctyp_plot_png + - umap_gr_ctyp_spl_idnt_plot_png - cmp_gr_ctyp_spl_idnt_plot_png - cmp_gr_idnt_spl_ctyp_plot_png + - umap_gr_ph_spl_idnt_plot_png - cmp_gr_ph_spl_idnt_plot_png + - umap_gr_ctyp_spl_ph_png + - cmp_gr_ph_spl_ctyp_png + - umap_gr_ctyp_spl_cnd_plot_png - cmp_gr_ctyp_spl_cnd_plot_png - cmp_gr_cnd_spl_ctyp_plot_png - - cmp_gr_ph_spl_ctyp_plot_png + - umap_gr_ph_spl_cnd_plot_png + - cmp_gr_ph_spl_cnd_plot_png - xpr_avg_plot_png + - xpr_per_cell_plot_png - xpr_dnst_plot_png - - xpr_per_cell_rd_rnaumap_plot_png - - xpr_per_cell_rd_atacumap_plot_png - - xpr_per_cell_rd_wnnumap_plot_png - - cvrg_plot_png - xpr_htmp_plot_png - - umap_rd_rnaumap_plot_pdf - - umap_rd_atacumap_plot_pdf - - umap_rd_wnnumap_plot_pdf - - umap_spl_idnt_rd_rnaumap_plot_pdf - - umap_spl_idnt_rd_atacumap_plot_pdf - - umap_spl_idnt_rd_wnnumap_plot_pdf - - umap_spl_cnd_rd_rnaumap_plot_pdf - - 
umap_spl_cnd_rd_atacumap_plot_pdf - - umap_spl_cnd_rd_wnnumap_plot_pdf - - umap_spl_ph_rd_rnaumap_plot_pdf - - umap_spl_ph_rd_atacumap_plot_pdf - - umap_spl_ph_rd_wnnumap_plot_pdf + - cvrg_plot_png + - umap_gr_ctyp_plot_pdf + - umap_gr_ctyp_spl_idnt_plot_pdf - cmp_gr_ctyp_spl_idnt_plot_pdf - cmp_gr_idnt_spl_ctyp_plot_pdf + - umap_gr_ph_spl_idnt_plot_pdf - cmp_gr_ph_spl_idnt_plot_pdf + - umap_gr_ctyp_spl_ph_plot_pdf + - cmp_gr_ph_spl_ctyp_plot_pdf + - umap_gr_ctyp_spl_cnd_plot_pdf - cmp_gr_ctyp_spl_cnd_plot_pdf - cmp_gr_cnd_spl_ctyp_plot_pdf - - cmp_gr_ph_spl_ctyp_plot_pdf + - umap_gr_ph_spl_cnd_plot_pdf + - cmp_gr_ph_spl_cnd_plot_pdf - xpr_avg_plot_pdf + - xpr_per_cell_plot_pdf + - xpr_per_cell_sgnl_plot_pdf - xpr_dnst_plot_pdf - - xpr_per_cell_rd_rnaumap_plot_pdf - - xpr_per_cell_rd_atacumap_plot_pdf - - xpr_per_cell_rd_wnnumap_plot_pdf - - xpr_per_cell_sgnl_rd_rnaumap_plot_pdf - - xpr_per_cell_sgnl_rd_atacumap_plot_pdf - - xpr_per_cell_sgnl_rd_wnnumap_plot_pdf - - cvrg_plot_pdf - xpr_htmp_plot_pdf + - cvrg_plot_pdf - xpr_htmp_tsv - gene_markers_tsv - peak_markers_tsv @@ -748,34 +699,25 @@ steps: in: input_files: source: - - ctype_assign/umap_rd_rnaumap_plot_pdf - - ctype_assign/umap_rd_atacumap_plot_pdf - - ctype_assign/umap_rd_wnnumap_plot_pdf - - ctype_assign/umap_spl_idnt_rd_rnaumap_plot_pdf - - ctype_assign/umap_spl_idnt_rd_atacumap_plot_pdf - - ctype_assign/umap_spl_idnt_rd_wnnumap_plot_pdf - - ctype_assign/umap_spl_cnd_rd_rnaumap_plot_pdf - - ctype_assign/umap_spl_cnd_rd_atacumap_plot_pdf - - ctype_assign/umap_spl_cnd_rd_wnnumap_plot_pdf - - ctype_assign/umap_spl_ph_rd_rnaumap_plot_pdf - - ctype_assign/umap_spl_ph_rd_atacumap_plot_pdf - - ctype_assign/umap_spl_ph_rd_wnnumap_plot_pdf + - ctype_assign/umap_gr_ctyp_plot_pdf + - ctype_assign/umap_gr_ctyp_spl_idnt_plot_pdf - ctype_assign/cmp_gr_ctyp_spl_idnt_plot_pdf - ctype_assign/cmp_gr_idnt_spl_ctyp_plot_pdf + - ctype_assign/umap_gr_ph_spl_idnt_plot_pdf - ctype_assign/cmp_gr_ph_spl_idnt_plot_pdf + - 
ctype_assign/umap_gr_ctyp_spl_ph_plot_pdf + - ctype_assign/cmp_gr_ph_spl_ctyp_plot_pdf + - ctype_assign/umap_gr_ctyp_spl_cnd_plot_pdf - ctype_assign/cmp_gr_ctyp_spl_cnd_plot_pdf - ctype_assign/cmp_gr_cnd_spl_ctyp_plot_pdf - - ctype_assign/cmp_gr_ph_spl_ctyp_plot_pdf + - ctype_assign/umap_gr_ph_spl_cnd_plot_pdf + - ctype_assign/cmp_gr_ph_spl_cnd_plot_pdf - ctype_assign/xpr_avg_plot_pdf + - ctype_assign/xpr_per_cell_plot_pdf + - ctype_assign/xpr_per_cell_sgnl_plot_pdf - ctype_assign/xpr_dnst_plot_pdf - - ctype_assign/xpr_per_cell_rd_rnaumap_plot_pdf - - ctype_assign/xpr_per_cell_rd_atacumap_plot_pdf - - ctype_assign/xpr_per_cell_rd_wnnumap_plot_pdf - - ctype_assign/xpr_per_cell_sgnl_rd_rnaumap_plot_pdf - - ctype_assign/xpr_per_cell_sgnl_rd_atacumap_plot_pdf - - ctype_assign/xpr_per_cell_sgnl_rd_wnnumap_plot_pdf - - ctype_assign/cvrg_plot_pdf - ctype_assign/xpr_htmp_plot_pdf + - ctype_assign/cvrg_plot_pdf valueFrom: $(self.flat().filter(n => n)) folder_basename: default: "pdf_plots" diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 3bdb8f9c..ebf0acac 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -123,7 +123,7 @@ inputs: the "Selected cell barcodes (optional)" input. Default: use the original peaks generated by Cell Ranger RNA+ATAC Sample. - 'sd:layout': + "sd:layout": advanced: true minimum_qvalue: @@ -135,7 +135,7 @@ inputs: detection. Ignored if "Cells grouping for MACS2 peak calling" input is not provided. Default: 0.05 - 'sd:layout': + "sd:layout": advanced: true remove_doublets: @@ -181,7 +181,9 @@ inputs: "aggregation_metadata.csv" output generated by "Cell Ranger RNA+ATAC Sample" and accessible on the "Files" - tab. + tab. Any 0 will be replaced with the + auto-estimated threshold (median - + - 2.5 * MAD) calculated per dataset. 
Default: 500 "sd:layout": advanced: true @@ -206,7 +208,9 @@ inputs: "aggregation_metadata.csv" output generated by "Cell Ranger RNA+ATAC Sample" and accessible on the "Files" - tab. + tab. Any 0 will be replaced with the + auto-estimated threshold (median - + - 2.5 * MAD) calculated per dataset. Default: 250 "sd:layout": advanced: true @@ -231,7 +235,9 @@ inputs: "aggregation_metadata.csv" output generated by "Cell Ranger RNA+ATAC Sample" and accessible on the "Files" - tab. + tab. Any 0 will be replaced with the + auto-estimated threshold (median + + + 5 * MAD) calculated per dataset. Default: 5000 "sd:layout": advanced: true @@ -257,6 +263,10 @@ inputs: cells with the percentage of RNA reads mapped to mitochondrial genes exceeding the provided value. + Set to 0 for using an auto-estimated + threshold equal to the maximum among + (median + 2 * MAD) values calculated + per dataset. Default: 5 "sd:layout": advanced: true @@ -309,7 +319,9 @@ inputs: "aggregation_metadata.csv" output generated by "Cell Ranger RNA+ATAC Sample" and accessible on the "Files" - tab. + tab. Any 0 will be replaced with the + auto-estimated threshold (median - + - 2.5 * MAD) calculated per dataset. Default: 1000 "sd:layout": advanced: true @@ -457,13 +469,13 @@ inputs: - "4" - "5" - "6" - default: "1" + default: "6" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 1 + Default: 6 "sd:layout": advanced: true @@ -542,6 +554,18 @@ outputs: tab: "Raw" Caption: "Genes vs RNA reads" + raw_umi_mito_plot_png: + type: File? + outputSource: sc_multiome_filter/raw_umi_mito_plot_png + label: "RNA reads vs mitochondrial %, raw" + doc: | + RNA reads vs mitochondrial % per cell + for raw data + "sd:visualPlugins": + - image: + tab: "Raw" + Caption: "RNA reads vs mitochondrial %" + raw_mito_dnst_plot_png: type: File? 
outputSource: sc_multiome_filter/raw_mito_dnst_plot_png @@ -862,6 +886,18 @@ outputs: tab: "Filtered" Caption: "Genes vs RNA reads" + fltr_umi_mito_plot_png: + type: File? + outputSource: sc_multiome_filter/fltr_umi_mito_plot_png + label: "RNA reads vs mitochondrial %, filtered" + doc: | + RNA reads vs mitochondrial % per cell + for filtered data + "sd:visualPlugins": + - image: + tab: "Filtered" + Caption: "RNA reads vs mitochondrial %, filtered" + fltr_mito_dnst_plot_png: type: File? outputSource: sc_multiome_filter/fltr_mito_dnst_plot_png @@ -1114,17 +1150,16 @@ outputs: ucsc_cb_html_data: type: Directory outputSource: sc_multiome_filter/ucsc_cb_html_data - label: "UCSC Cell Browser data" + label: "UCSC Cell Browser (data)" doc: | - Directory with UCSC Cell Browser - data + UCSC Cell Browser html data. ucsc_cb_html_file: type: File outputSource: sc_multiome_filter/ucsc_cb_html_file label: "UCSC Cell Browser" doc: | - UCSC Cell Browser HTML index file + UCSC Cell Browser html index. "sd:visualPlugins": - linkList: tab: "Overview" @@ -1133,9 +1168,10 @@ outputs: seurat_data_rds: type: File outputSource: sc_multiome_filter/seurat_data_rds - label: "Processed seurat data in RDS format" + label: "Seurat object in RDS format" doc: | - Processed seurat data in RDS format + Seurat object. + RDS format. datasets_metadata: type: File @@ -1148,26 +1184,23 @@ outputs: pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder - label: "Plots in PDF format" + label: "Compressed folder with all PDF plots" doc: | - Compressed folder with plots - in PDF format + Compressed folder with all PDF plots. sc_multiome_filter_stdout_log: type: File outputSource: sc_multiome_filter/stdout_log - label: "Output log, filtering step" + label: "Output log" doc: | - stdout log generated by - sc_multiome_filter step + Stdout log from the sc_multiome_filter step. 
sc_multiome_filter_stderr_log: type: File outputSource: sc_multiome_filter/stderr_log - label: "Error log, filtering step" + label: "Error log" doc: | - stderr log generated by - sc_multiome_filter step + Stderr log from the sc_multiome_filter step. steps: @@ -1278,6 +1311,7 @@ steps: - raw_umi_dnst_plot_png - raw_gene_dnst_plot_png - raw_gene_umi_plot_png + - raw_umi_mito_plot_png - raw_mito_dnst_plot_png - raw_nvlt_dnst_plot_png - raw_frgm_dnst_plot_png @@ -1304,6 +1338,7 @@ steps: - fltr_umi_dnst_plot_png - fltr_gene_dnst_plot_png - fltr_gene_umi_plot_png + - fltr_umi_mito_plot_png - fltr_mito_dnst_plot_png - fltr_nvlt_dnst_plot_png - fltr_frgm_dnst_plot_png @@ -1330,6 +1365,7 @@ steps: - raw_umi_dnst_plot_pdf - raw_gene_dnst_plot_pdf - raw_gene_umi_plot_pdf + - raw_umi_mito_plot_pdf - raw_mito_dnst_plot_pdf - raw_nvlt_dnst_plot_pdf - raw_frgm_dnst_plot_pdf @@ -1356,6 +1392,7 @@ steps: - fltr_umi_dnst_plot_pdf - fltr_gene_dnst_plot_pdf - fltr_gene_umi_plot_pdf + - fltr_umi_mito_plot_pdf - fltr_mito_dnst_plot_pdf - fltr_nvlt_dnst_plot_pdf - fltr_frgm_dnst_plot_pdf @@ -1394,6 +1431,7 @@ steps: - sc_multiome_filter/raw_umi_dnst_plot_pdf - sc_multiome_filter/raw_gene_dnst_plot_pdf - sc_multiome_filter/raw_gene_umi_plot_pdf + - sc_multiome_filter/raw_umi_mito_plot_pdf - sc_multiome_filter/raw_mito_dnst_plot_pdf - sc_multiome_filter/raw_nvlt_dnst_plot_pdf - sc_multiome_filter/raw_frgm_dnst_plot_pdf @@ -1420,6 +1458,7 @@ steps: - sc_multiome_filter/fltr_umi_dnst_plot_pdf - sc_multiome_filter/fltr_gene_dnst_plot_pdf - sc_multiome_filter/fltr_gene_umi_plot_pdf + - sc_multiome_filter/fltr_umi_mito_plot_pdf - sc_multiome_filter/fltr_mito_dnst_plot_pdf - sc_multiome_filter/fltr_nvlt_dnst_plot_pdf - sc_multiome_filter/fltr_frgm_dnst_plot_pdf @@ -1460,9 +1499,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-Cell Multiome ATAC and RNA-Seq Filtering Analysis" -s:name: "Single-Cell 
Multiome ATAC and RNA-Seq Filtering Analysis" -s:alternateName: "Removes low-quality cells" +label: "Single-Cell Multiome ATAC-Seq and RNA-Seq Filtering Analysis" +s:name: "Single-Cell Multiome ATAC-Seq and RNA-Seq Filtering Analysis" +s:alternateName: "Filters single-cell multiome ATAC and RNA-Seq datasets based on the common QC metrics" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-multiome-filter.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -1500,7 +1539,7 @@ s:creator: doc: | - Single-Cell Multiome ATAC and RNA-Seq Filtering Analysis + Single-Cell Multiome ATAC-Seq and RNA-Seq Filtering Analysis Removes low-quality cells from the outputs of “Cell Ranger Count (RNA+ATAC)” and “Cell Ranger Aggregate (RNA+ATAC)” pipelines. The diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index 18560bec..c80259d6 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -21,7 +21,7 @@ requirements: }; -'sd:upstream': +"sd:upstream": sc_tools_sample: - "sc-rna-cluster.cwl" - "sc-atac-cluster.cwl" @@ -47,8 +47,8 @@ inputs: "Single-Cell RNA-Seq Dimensionality Reduction Analysis" at any of the processing stages. - 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" - 'sd:localLabel': true + "sd:upstreamSource": "sc_tools_sample/seurat_data_rds" + "sd:localLabel": true dimensions: type: int? @@ -59,7 +59,8 @@ inputs: used in constructing nearest-neighbor graph as part of the clustering algorithm. Accepted values range from - 1 to 50. + 1 to 50. Set to 0 to use auto-estimated + dimensionality. Default: 40 resolution: @@ -138,92 +139,125 @@ inputs: - "4" - "5" - "6" - default: "1" + default: "6" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. 
- Default: 1 + Default: 6 "sd:layout": advanced: true outputs: - umap_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_rna_cluster/umap_res_plot_png - label: "UMAP, colored by cluster" + umap_gr_ph_spl_idnt_plot_png: + type: File? + outputSource: sc_rna_cluster/umap_gr_ph_spl_idnt_plot_png + label: "UMAP colored by cell cycle phase (split by dataset, downsampled)" doc: | - UMAP, colored by cluster - 'sd:visualPlugins': + UMAP colored by cell cycle phase. + Split by dataset; downsampled to the + smallest dataset. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per cluster' - Caption: 'UMAP, colored by cluster' + tab: "Per dataset" + Caption: "UMAP colored by cell cycle phase (split by dataset, downsampled)" - slh_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_rna_cluster/slh_res_plot_png - label: "Silhouette scores" + cmp_gr_ph_spl_idnt_plot_png: + type: File? + outputSource: sc_rna_cluster/cmp_gr_ph_spl_idnt_plot_png + label: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" doc: | - Silhouette scores - 'sd:visualPlugins': + Composition plot colored by cell cycle phase. + Split by dataset; downsampled to the smallest + dataset. + PNG format + "sd:visualPlugins": + - image: + tab: "Per dataset" + Caption: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" + + umap_gr_ph_spl_cnd_plot_png: + type: File? + outputSource: sc_rna_cluster/umap_gr_ph_spl_cnd_plot_png + label: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" + doc: | + UMAP colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + "sd:visualPlugins": + - image: + tab: "Per group" + Caption: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" + + cmp_gr_ph_spl_cnd_plot_png: + type: File? 
+ outputSource: sc_rna_cluster/cmp_gr_ph_spl_cnd_plot_png + label: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" + doc: | + Composition plot colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per cluster' - Caption: 'Silhouette scores' + tab: "Per group" + Caption: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" - umap_spl_ph_res_plot_png: + umap_gr_clst_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_rna_cluster/umap_spl_ph_res_plot_png - label: "UMAP, colored by cluster, split by cell cycle phase" + outputSource: sc_rna_cluster/umap_gr_clst_res_plot_png + label: "UMAP colored by cluster (all cells)" doc: | - UMAP, colored by cluster, - split by cell cycle phase - 'sd:visualPlugins': + UMAP colored by cluster. + All cells. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per cluster' - Caption: 'UMAP, colored by cluster, split by cell cycle phase' + tab: "Per cluster" + Caption: "UMAP colored by cluster (all cells)" - cmp_gr_ph_spl_clst_res_plot_png: + slh_gr_clst_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_rna_cluster/cmp_gr_ph_spl_clst_res_plot_png - label: "Composition plot, colored by cell cycle phase, split by cluster, downsampled" + outputSource: sc_rna_cluster/slh_gr_clst_res_plot_png + label: "Silhouette scores (all cells)" doc: | - Composition plot, colored by - cell cycle phase, split by - cluster, downsampled - 'sd:visualPlugins': + Silhouette scores. + All cells. + PNG format. 
+ "sd:visualPlugins": - image: - tab: 'Per cluster' - Caption: 'Composition plot, colored by cell cycle phase, split by cluster, downsampled' + tab: "Per cluster" + Caption: "Silhouette scores (all cells)" - umap_spl_idnt_res_plot_png: + umap_gr_clst_spl_idnt_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_rna_cluster/umap_spl_idnt_res_plot_png - label: "UMAP, colored by cluster, split by dataset" + outputSource: sc_rna_cluster/umap_gr_clst_spl_idnt_res_plot_png + label: "UMAP colored by cluster (split by dataset, downsampled)" doc: | - UMAP, colored by cluster, - split by dataset - 'sd:visualPlugins': + UMAP colored by cluster. + Split by dataset; downsampled + to the smallest dataset. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by cluster, split by dataset' + tab: "Per dataset" + Caption: "UMAP colored by cluster (split by dataset, downsampled)" cmp_gr_clst_spl_idnt_res_plot_png: type: @@ -231,15 +265,16 @@ outputs: - type: array items: File outputSource: sc_rna_cluster/cmp_gr_clst_spl_idnt_res_plot_png - label: "Composition plot, colored by cluster, split by dataset, downsampled" + label: "Composition plot colored by cluster (split by dataset, downsampled)" doc: | - Composition plot, colored by - cluster, split by dataset, - downsampled - 'sd:visualPlugins': + Composition plot colored by cluster. + Split by dataset; downsampled + to the smallest dataset. + PNG format. 
+ "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Composition plot, colored by cluster, split by dataset, downsampled' + tab: "Per dataset" + Caption: "Composition plot colored by cluster (split by dataset, downsampled)" cmp_gr_idnt_spl_clst_res_plot_png: type: @@ -247,43 +282,70 @@ outputs: - type: array items: File outputSource: sc_rna_cluster/cmp_gr_idnt_spl_clst_res_plot_png - label: "Composition plot, colored by dataset, split by cluster, downsampled" + label: "Composition plot colored by dataset (split by cluster, downsampled)" doc: | - Composition plot, colored by - dataset, split by cluster, - downsampled - 'sd:visualPlugins': + Composition plot colored by dataset. + Split by cluster; downsampled to the + smallest dataset. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Composition plot, colored by dataset, split by cluster, downsampled' + tab: "Per dataset" + Caption: "Composition plot colored by dataset (split by cluster, downsampled)" - cmp_gr_ph_spl_idnt_plot_png: - type: File? - outputSource: sc_rna_cluster/cmp_gr_ph_spl_idnt_plot_png - label: "Composition plot, colored by cell cycle phase, split by dataset, downsampled" - doc: | - Composition plot, colored by - cell cycle phase, split by - dataset, downsampled - 'sd:visualPlugins': + umap_gr_clst_spl_ph_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/umap_gr_clst_spl_ph_res_plot_png + label: "UMAP colored by cluster (split by cell cycle phase, optionally downsampled)" + doc: | + UMAP colored by cluster. + Split by cell cycle phase; downsampled + to the smallest dataset (if multiple + datasets are analyzed jointly). + PNG format. 
+ "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Composition plot, colored by cell cycle phase, split by dataset, downsampled' + tab: "Per cluster" + Caption: "UMAP colored by cluster (split by cell cycle phase, optionally downsampled)" - umap_spl_cnd_res_plot_png: + cmp_gr_ph_spl_clst_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_rna_cluster/umap_spl_cnd_res_plot_png - label: "UMAP, colored by cluster, split by grouping condition" + outputSource: sc_rna_cluster/cmp_gr_ph_spl_clst_res_plot_png + label: "Composition plot colored by cell cycle phase (split by cluster, optionally downsampled)" doc: | - UMAP, colored by cluster, - split by grouping condition - 'sd:visualPlugins': + Composition plot colored by cell cycle phase. + Split by cluster; downsampled to the smallest + dataset (if multiple datasets are analyzed + jointly). + PNG format + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by cluster, split by grouping condition' + tab: "Per cluster" + Caption: "Composition plot colored by cell cycle phase (split by cluster, optionally downsampled)" + + umap_gr_clst_spl_cnd_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/umap_gr_clst_spl_cnd_res_plot_png + label: "UMAP colored by cluster (split by grouping condition, downsampled)" + doc: | + UMAP colored by cluster. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "Per group" + Caption: "UMAP colored by cluster (split by grouping condition, downsampled)" cmp_gr_clst_spl_cnd_res_plot_png: type: @@ -291,15 +353,17 @@ outputs: - type: array items: File outputSource: sc_rna_cluster/cmp_gr_clst_spl_cnd_res_plot_png - label: "Composition plot, colored by cluster, split by grouping condition, downsampled" + label: "Composition plot colored by cluster (split by grouping condition, downsampled)" doc: | - Composition plot, colored by - cluster, split by grouping - condition, downsampled - 'sd:visualPlugins': + Composition plot colored by cluster. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'Composition plot, colored by cluster, split by grouping condition, downsampled' + tab: "Per group" + Caption: "Composition plot colored by cluster (split by grouping condition, downsampled)" cmp_gr_cnd_spl_clst_res_plot_png: type: @@ -307,57 +371,64 @@ outputs: - type: array items: File outputSource: sc_rna_cluster/cmp_gr_cnd_spl_clst_res_plot_png - label: "Composition plot, colored by grouping condition, split by cluster, downsampled" + label: "Composition plot colored by grouping condition (split by cluster, downsampled)" doc: | - Composition plot, colored by - grouping condition, split by - cluster, downsampled - 'sd:visualPlugins': + Composition plot colored by grouping condition. + Split by cluster; first downsampled to the + smallest dataset, then downsampled to the + smallest group. + PNG format. 
+ "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'Composition plot, colored by grouping condition, split by cluster, downsampled' + tab: "Per group" + Caption: "Composition plot colored by grouping condition (split by cluster, downsampled)" - xpr_avg_res_plot_png: + xpr_per_cell_plot_png: type: - "null" - type: array items: File - outputSource: sc_rna_cluster/xpr_avg_res_plot_png - label: "Gene expression dot plot" + outputSource: sc_rna_cluster/xpr_per_cell_plot_png + label: "UMAP colored by gene expression (per gene)" doc: | - Gene expression dot plot - 'sd:visualPlugins': + UMAP colored by gene expression. + All genes of interest. + PNG format. + "sd:visualPlugins": - image: - tab: 'Genes of interest' - Caption: 'Gene expression dot plot' + tab: "Gene expression" + Caption: "UMAP colored by gene expression (per gene)" - xpr_dnst_res_plot_png: + xpr_avg_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_rna_cluster/xpr_dnst_res_plot_png - label: "Gene expression violin plot" + outputSource: sc_rna_cluster/xpr_avg_res_plot_png + label: "Average gene expression" doc: | - Gene expression violin plot - 'sd:visualPlugins': + Average gene expression. + PNG format. + "sd:visualPlugins": - image: - tab: 'Genes of interest' - Caption: 'Gene expression violin plot' + tab: "Gene expression" + Caption: "Average gene expression" - xpr_per_cell_plot_png: + xpr_dnst_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_rna_cluster/xpr_per_cell_plot_png - label: "UMAP, gene expression" + outputSource: sc_rna_cluster/xpr_dnst_res_plot_png + label: "Gene expression density (per gene)" doc: | - UMAP, gene expression - 'sd:visualPlugins': + Gene expression density. + All genes of interest. + PNG format. 
+ "sd:visualPlugins": - image: - tab: 'Genes of interest' - Caption: 'UMAP, gene expression' + tab: "Gene expression" + Caption: "Gene expression density (per gene)" xpr_htmp_res_plot_png: type: @@ -365,13 +436,15 @@ outputs: - type: array items: File outputSource: sc_rna_cluster/xpr_htmp_res_plot_png - label: "Gene expression heatmap" + label: "Gene expression heatmap (top gene markers)" doc: | - Gene expression heatmap - 'sd:visualPlugins': + Gene expression heatmap. + Top gene markers. + PNG format. + "sd:visualPlugins": - image: - tab: 'Heatmap' - Caption: 'Gene expression heatmap' + tab: "Gene expression heatmap" + Caption: "Gene expression heatmap (top gene markers)" xpr_htmp_res_tsv: type: @@ -379,37 +452,37 @@ outputs: - type: array items: File outputSource: sc_rna_cluster/xpr_htmp_res_tsv - label: "Markers from gene expression heatmap" + label: "Gene expression heatmap (top gene markers)" doc: | - Gene markers used for gene - expression heatmap + Gene expression heatmap. + Top gene markers. + TSV format. gene_markers_tsv: type: File? outputSource: sc_rna_cluster/gene_markers_tsv - label: "Gene markers per cluster for all resolutions" + label: "Gene markers" doc: | - Gene markers per cluster for - all resolutions - 'sd:visualPlugins': + Gene markers. + TSV format. + "sd:visualPlugins": - syncfusiongrid: - tab: 'Gene markers' - Title: 'Gene markers per cluster for all resolutions' + tab: "Gene markers" + Title: "Gene markers" ucsc_cb_html_data: type: Directory? outputSource: sc_rna_cluster/ucsc_cb_html_data - label: "UCSC Cell Browser data" + label: "UCSC Cell Browser (data)" doc: | - Directory with UCSC Cell Browser - data + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? outputSource: sc_rna_cluster/ucsc_cb_html_file label: "UCSC Cell Browser" doc: | - UCSC Cell Browser HTML index file + UCSC Cell Browser html index. 
"sd:visualPlugins": - linkList: tab: "Overview" @@ -418,38 +491,40 @@ outputs: seurat_data_rds: type: File outputSource: sc_rna_cluster/seurat_data_rds - label: "Processed Seurat data in RDS format" + label: "Seurat object in RDS format" doc: | - Processed Seurat data in RDS format + Seurat object. + RDS format. seurat_data_scope: type: File? outputSource: sc_rna_cluster/seurat_data_scope - label: "Processed Seurat data in SCope compatible loom format" + label: "Seurat object in SCope compatible loom format" doc: | - Processed Seurat data in SCope compatible loom format + Seurat object. + SCope compatible. + Loom format. pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder - label: "Plots in PDF format" + label: "Compressed folder with all PDF plots" doc: | - Compressed folder with plots - in PDF format + Compressed folder with all PDF plots. sc_rna_cluster_stdout_log: type: File outputSource: sc_rna_cluster/stdout_log - label: "stdout log generated by sc_rna_cluster step" + label: "Output log" doc: | - stdout log generated by sc_rna_cluster step + Stdout log from the sc_rna_cluster step. sc_rna_cluster_stderr_log: type: File outputSource: sc_rna_cluster/stderr_log - label: "stderr log generated by sc_rna_cluster step" + label: "Error log" doc: | - stderr log generated by sc_rna_cluster step + Stderr log from the sc_rna_cluster step. 
steps: @@ -495,35 +570,41 @@ steps: source: threads valueFrom: $(parseInt(self)) out: - - umap_res_plot_png - - slh_res_plot_png - - umap_spl_idnt_res_plot_png + - umap_gr_ph_spl_idnt_plot_png + - cmp_gr_ph_spl_idnt_plot_png + - umap_gr_ph_spl_cnd_plot_png + - cmp_gr_ph_spl_cnd_plot_png + - umap_gr_clst_res_plot_png + - slh_gr_clst_res_plot_png + - umap_gr_clst_spl_idnt_res_plot_png - cmp_gr_clst_spl_idnt_res_plot_png - cmp_gr_idnt_spl_clst_res_plot_png - - umap_spl_cnd_res_plot_png + - umap_gr_clst_spl_ph_res_plot_png + - cmp_gr_ph_spl_clst_res_plot_png + - umap_gr_clst_spl_cnd_res_plot_png - cmp_gr_clst_spl_cnd_res_plot_png - cmp_gr_cnd_spl_clst_res_plot_png - - umap_spl_ph_res_plot_png - - cmp_gr_ph_spl_idnt_plot_png - - cmp_gr_ph_spl_clst_res_plot_png - - xpr_avg_res_plot_png - xpr_per_cell_plot_png + - xpr_avg_res_plot_png - xpr_dnst_res_plot_png - xpr_htmp_res_plot_png - - umap_res_plot_pdf - - slh_res_plot_pdf - - umap_spl_idnt_res_plot_pdf + - umap_gr_ph_spl_idnt_plot_pdf + - cmp_gr_ph_spl_idnt_plot_pdf + - umap_gr_ph_spl_cnd_plot_pdf + - cmp_gr_ph_spl_cnd_plot_pdf + - umap_gr_clst_res_plot_pdf + - slh_gr_clst_res_plot_pdf + - umap_gr_clst_spl_idnt_res_plot_pdf - cmp_gr_clst_spl_idnt_res_plot_pdf - cmp_gr_idnt_spl_clst_res_plot_pdf - - umap_spl_cnd_res_plot_pdf + - umap_gr_clst_spl_ph_res_plot_pdf + - cmp_gr_ph_spl_clst_res_plot_pdf + - umap_gr_clst_spl_cnd_res_plot_pdf - cmp_gr_clst_spl_cnd_res_plot_pdf - cmp_gr_cnd_spl_clst_res_plot_pdf - - umap_spl_ph_res_plot_pdf - - cmp_gr_ph_spl_idnt_plot_pdf - - cmp_gr_ph_spl_clst_res_plot_pdf - - xpr_avg_res_plot_pdf - xpr_per_cell_plot_pdf - xpr_per_cell_sgnl_plot_pdf + - xpr_avg_res_plot_pdf - xpr_dnst_res_plot_pdf - xpr_htmp_res_plot_pdf - xpr_htmp_res_tsv @@ -540,20 +621,23 @@ steps: in: input_files: source: - - sc_rna_cluster/umap_res_plot_pdf - - sc_rna_cluster/slh_res_plot_pdf - - sc_rna_cluster/umap_spl_idnt_res_plot_pdf + - sc_rna_cluster/umap_gr_ph_spl_idnt_plot_pdf + - 
sc_rna_cluster/cmp_gr_ph_spl_idnt_plot_pdf + - sc_rna_cluster/umap_gr_ph_spl_cnd_plot_pdf + - sc_rna_cluster/cmp_gr_ph_spl_cnd_plot_pdf + - sc_rna_cluster/umap_gr_clst_res_plot_pdf + - sc_rna_cluster/slh_gr_clst_res_plot_pdf + - sc_rna_cluster/umap_gr_clst_spl_idnt_res_plot_pdf - sc_rna_cluster/cmp_gr_clst_spl_idnt_res_plot_pdf - sc_rna_cluster/cmp_gr_idnt_spl_clst_res_plot_pdf - - sc_rna_cluster/umap_spl_cnd_res_plot_pdf + - sc_rna_cluster/umap_gr_clst_spl_ph_res_plot_pdf + - sc_rna_cluster/cmp_gr_ph_spl_clst_res_plot_pdf + - sc_rna_cluster/umap_gr_clst_spl_cnd_res_plot_pdf - sc_rna_cluster/cmp_gr_clst_spl_cnd_res_plot_pdf - sc_rna_cluster/cmp_gr_cnd_spl_clst_res_plot_pdf - - sc_rna_cluster/umap_spl_ph_res_plot_pdf - - sc_rna_cluster/cmp_gr_ph_spl_idnt_plot_pdf - - sc_rna_cluster/cmp_gr_ph_spl_clst_res_plot_pdf - - sc_rna_cluster/xpr_avg_res_plot_pdf - sc_rna_cluster/xpr_per_cell_plot_pdf - sc_rna_cluster/xpr_per_cell_sgnl_plot_pdf + - sc_rna_cluster/xpr_avg_res_plot_pdf - sc_rna_cluster/xpr_dnst_res_plot_pdf - sc_rna_cluster/xpr_htmp_res_plot_pdf valueFrom: $(self.flat().filter(n => n)) diff --git a/workflows/sc-rna-da-cells.cwl b/workflows/sc-rna-da-cells.cwl index 598c8733..08ec4421 100644 --- a/workflows/sc-rna-da-cells.cwl +++ b/workflows/sc-rna-da-cells.cwl @@ -14,7 +14,7 @@ requirements: }; -'sd:upstream': +"sd:upstream": sc_tools_sample: - "sc-rna-reduce.cwl" - "sc-atac-reduce.cwl" @@ -41,8 +41,8 @@ inputs: expression information stored in the RNA assay and selected with the --reduction parameter dimensionality reduction. Additionally, 'rnaumap', and/or 'atacumap', and/or 'wnnumap' dimensionality reductions should be present. - 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" - 'sd:localLabel': true + "sd:upstreamSource": "sc_tools_sample/seurat_data_rds" + "sd:localLabel": true splitby: type: string @@ -123,7 +123,7 @@ inputs: Color theme for all generated plots. One of gray, bw, linedraw, light, dark, minimal, classic, void. 
Default: classic - 'sd:layout': + "sd:layout": advanced: true threads: @@ -137,14 +137,14 @@ inputs: - "4" - "5" - "6" - default: "1" + default: "6" label: "Number of cores/cpus to use" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 1 - 'sd:layout': + Default: 6 + "sd:layout": advanced: true @@ -158,7 +158,7 @@ outputs: DA scores random permutations plot for second vs first biological conditions comparison. PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: tab: 'Overall' Caption: 'DA scores random permutations plot' @@ -173,7 +173,7 @@ outputs: doc: | Clustered DA cells subpopulations UMAP (rnaumap dim. reduction). PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: tab: 'Overall' Caption: 'Clustered DA cells subpopulations RNA UMAP' @@ -188,7 +188,7 @@ outputs: doc: | Clustered DA cells subpopulations UMAP (atacumap dim. reduction). PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: tab: 'Overall' Caption: 'Clustered DA cells subpopulations ATAC UMAP' @@ -203,7 +203,7 @@ outputs: doc: | Clustered DA cells subpopulations UMAP (wnnumap dim. reduction). PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: tab: 'Overall' Caption: 'Clustered DA cells subpopulations WNN UMAP' @@ -219,9 +219,9 @@ outputs: Split by grouping condition clustered DA cells subpopulations UMAP (rnaumap dim. reduction). PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' + tab: "Per group" Caption: 'Split by grouping condition clustered DA cells subpopulations RNA UMAP' umap_spl_cnd_rd_atacumap_res_plot_png: @@ -235,9 +235,9 @@ outputs: Split by grouping condition clustered DA cells subpopulations UMAP (atacumap dim. reduction). 
PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' + tab: "Per group" Caption: 'Split by grouping condition clustered DA cells subpopulations ATAC UMAP' umap_spl_cnd_rd_wnnumap_res_plot_png: @@ -251,9 +251,9 @@ outputs: Split by grouping condition clustered DA cells subpopulations UMAP (wnnumap dim. reduction). PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' + tab: "Per group" Caption: 'Split by grouping condition clustered DA cells subpopulations WNN UMAP' umap_spl_idnt_rd_rnaumap_da_scr_plot_png: @@ -264,9 +264,9 @@ outputs: Split by dataset cells UMAP with DA scores for second vs first biological conditions comparison (rnaumap dim. reduction). PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' + tab: "Per dataset" Caption: 'Split by dataset cells RNA UMAP with DA scores' umap_spl_idnt_rd_atacumap_da_scr_plot_png: @@ -277,9 +277,9 @@ outputs: Split by dataset cells UMAP with DA scores for second vs first biological conditions comparison (atacumap dim. reduction). PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' + tab: "Per dataset" Caption: 'Split by dataset cells ATAC UMAP with DA scores' umap_spl_idnt_rd_wnnumap_da_scr_plot_png: @@ -290,57 +290,57 @@ outputs: Split by dataset cells UMAP with DA scores for second vs first biological conditions comparison (wnnumap dim. reduction). PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' + tab: "Per dataset" Caption: 'Split by dataset cells WNN UMAP with DA scores' ucsc_cb_html_data: type: Directory outputSource: da_cells/ucsc_cb_html_data - label: "Directory with UCSC Cellbrowser html data" + label: "UCSC Cell Browser (data)" doc: | - Directory with UCSC Cellbrowser html data. + UCSC Cell Browser html data. 
ucsc_cb_html_file: type: File outputSource: da_cells/ucsc_cb_html_file - label: "Open in UCSC Cell Browser" + label: "UCSC Cell Browser" doc: | - HTML index file from the directory with UCSC Cellbrowser html data. - 'sd:visualPlugins': + UCSC Cell Browser html index. + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" seurat_data_rds: type: File outputSource: da_cells/seurat_data_rds - label: "Processed Seurat data in RDS format" + label: "Seurat object in RDS format" doc: | - Processed Seurat data in RDS format + Seurat object. + RDS format. pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder - label: "Plots in PDF format" + label: "Compressed folder with all PDF plots" doc: | - Compressed folder with plots - in PDF format + Compressed folder with all PDF plots. da_cells_stdout_log: type: File outputSource: da_cells/stdout_log - label: "stdout log generated by da_cells step" + label: "Output log" doc: | - stdout log generated by da_cells step + Stdout log from the da_cells step. da_cells_stderr_log: type: File outputSource: da_cells/stderr_log - label: "stderr log generated by da_cells step" + label: "Error log" doc: | - stderr log generated by da_cells step + Stderr log from the da_cells step. 
steps: @@ -438,7 +438,7 @@ $schemas: label: "Single-Cell Differential Abundance Analysis" s:name: "Single-Cell Differential Abundance Analysis" -s:alternateName: "Compares the composition of cell types between two tested conditions" +s:alternateName: "Detects cell subpopulations with differential abundance between datasets split by biological condition" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-da-cells.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium diff --git a/workflows/sc-rna-de-pseudobulk.cwl b/workflows/sc-rna-de-pseudobulk.cwl index 344920de..c2332196 100644 --- a/workflows/sc-rna-de-pseudobulk.cwl +++ b/workflows/sc-rna-de-pseudobulk.cwl @@ -24,7 +24,7 @@ requirements: }; -'sd:upstream': +"sd:upstream": sc_tools_sample: - "sc-rna-cluster.cwl" - "sc-ctype-assign.cwl" @@ -52,8 +52,8 @@ inputs: Cluster Analysis", or "Single-Cell WNN Cluster Analysis" at any of the processing stages. - 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" - 'sd:localLabel': true + "sd:upstreamSource": "sc_tools_sample/seurat_data_rds" + "sd:localLabel": true groupby: type: string? @@ -280,13 +280,13 @@ inputs: - "4" - "5" - "6" - default: "1" + default: "6" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. 
- Default: 1 + Default: 6 "sd:layout": advanced: true @@ -301,9 +301,9 @@ outputs: MDS plot of pseudobulk aggregated not filtered normalized reads counts in HTML format - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" heatmap_html: @@ -312,9 +312,9 @@ outputs: label: "Heatmap" doc: | Morpheus heatmap in HTML format - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" volcano_plot_html_file: @@ -323,9 +323,9 @@ outputs: label: "Volcano Plot" doc: | HTML index file for Volcano Plot - 'sd:visualPlugins': + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" volcano_plot_html_data: @@ -342,7 +342,7 @@ outputs: doc: | Gene expression PCA (1,2) in PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: tab: 'QC' Caption: 'Gene expression PCA (1,2)' @@ -354,7 +354,7 @@ outputs: doc: | Gene expression PCA (2,3) in PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: tab: 'QC' Caption: 'Gene expression PCA (2,3)' @@ -370,7 +370,7 @@ outputs: with "Subsetting values (optional)" from the "Subsetting category (optional)", RNA PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: tab: 'QC' Caption: 'UMAP, split by comparison category, RNA' @@ -386,7 +386,7 @@ outputs: with "Subsetting values (optional)" from the "Subsetting category (optional)", ATAC PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: tab: 'QC' Caption: 'UMAP, split by comparison category, ATAC' @@ -402,7 +402,7 @@ outputs: with "Subsetting values (optional)" from the "Subsetting category (optional)", WNN PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: tab: 'QC' Caption: 'UMAP, split by comparison category, WNN' @@ -416,9 +416,9 @@ outputs: genes. Highlighed genes are either provided by user or top 10 genes with the highest log2FoldChange values. 
PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Genes of interest' + tab: "Genes of interest" Caption: 'Volcano plot of differentially expressed genes' xpr_dnst_plot_png: @@ -431,9 +431,9 @@ outputs: differentially expressed genes with the highest log2FoldChange values in PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Genes of interest' + tab: "Genes of interest" Caption: 'Gene expression violin plot' xpr_htmp_plot_png: @@ -445,7 +445,7 @@ outputs: by adjusted P-value, optionally subsetted to the specific groups of cells in PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: tab: 'Heatmap' Caption: 'Gene expression heatmap' @@ -461,7 +461,7 @@ outputs: UMAP, gene expression, split by selected criteria, optionally subsetted to the specific group, RNA, PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: tab: 'Gene expression, RNA' Caption: 'UMAP, gene expression, RNA' @@ -477,7 +477,7 @@ outputs: UMAP, gene expression, split by selected criteria, optionally subsetted to the specific group, ATAC, PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: tab: 'Gene expression, ATAC' Caption: 'UMAP, gene expression, ATAC' @@ -493,7 +493,7 @@ outputs: UMAP, gene expression, split by selected criteria, optionally subsetted to the specific group, WNN, PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: tab: 'Gene expression, WNN' Caption: 'UMAP, gene expression, WNN' @@ -506,7 +506,7 @@ outputs: Not filtered by adjusted P-value differentially expressed genes in TSV format - 'sd:visualPlugins': + "sd:visualPlugins": - syncfusiongrid: tab: 'Diff. expressed genes' Title: 'Differentially expressed genes' @@ -539,36 +539,23 @@ outputs: pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder - label: "Plots in PDF format" + label: "Compressed folder with all PDF plots" doc: | - Compressed folder with plots - in PDF format + Compressed folder with all PDF plots. 
de_pseudobulk_stdout_log: type: File outputSource: de_pseudobulk/stdout_log - label: "stdout log generated by de_pseudobulk step" + label: "Output log" doc: | - stdout log generated by de_pseudobulk step + Stdout log from the de_pseudobulk step. de_pseudobulk_stderr_log: type: File outputSource: de_pseudobulk/stderr_log - label: "stderr log generated by de_pseudobulk step" + label: "Error log" doc: | - stderr log generated by de_pseudobulk step - - morpheus_heatmap_stdout_log: - type: File - outputSource: morpheus_heatmap/stdout_log - label: "stdout log generated by morpheus_heatmap step" - doc: "stdout log generated by morpheus_heatmap step" - - morpheus_heatmap_stderr_log: - type: File - outputSource: morpheus_heatmap/stderr_log - label: "stderr log generated by morpheus_heatmap step" - doc: "stderr log generated by morpheus_heatmap step" + Stderr log from the de_pseudobulk step. steps: @@ -720,8 +707,6 @@ steps: read_counts_gct: de_pseudobulk/cell_read_counts_gct out: - heatmap_html - - stdout_log - - stderr_log make_volcano_plot: run: ../tools/volcano-plot.cwl @@ -746,7 +731,7 @@ $schemas: label: "Single-Cell RNA-Seq Differential Expression Analysis" s:name: "Single-Cell RNA-Seq Differential Expression Analysis" -s:alternateName: "Identifies differentially expressed genes between any two groups of cells" +s:alternateName: "Identifies differentially expressed genes between two groups of cells optionally coerced to pseudobulk form" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-de-pseudobulk.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index 05f278a6..068db192 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -117,7 +117,10 @@ inputs: order from the "aggregation_metadata.csv" output generated by "Cell Ranger RNA or RNA+VDJ Sample" and accessible on the - "Files" tab. + "Files" tab. 
Any 0 will be replaced + with the auto-estimated threshold + (median - 2.5 * MAD) calculated per + dataset. Default: 500 "sd:layout": advanced: true @@ -142,7 +145,9 @@ inputs: order from the "aggregation_metadata.csv" output generated by "Cell Ranger RNA or RNA+VDJ Sample" and accessible on the - "Files" tab. + "Files" tab. Any 0 will be replaced with + the auto-estimated threshold (median - + 2.5 * MAD) calculated per dataset. Default: 250 "sd:layout": advanced: true @@ -167,7 +172,9 @@ inputs: order from the "aggregation_metadata.csv" output generated by "Cell Ranger RNA or RNA+VDJ Sample" and accessible on the - "Files" tab. + "Files" tab. Any 0 will be replaced with + the auto-estimated threshold (median + + 5 * MAD) calculated per dataset. Default: 5000 "sd:layout": advanced: true @@ -192,7 +199,10 @@ inputs: to exclude from the analysis all cells with the percentage of RNA reads mapped to mitochondrial genes exceeding - the provided value. + the provided value. Set to 0 to use + an auto-estimated threshold equal to + the maximum among (median + 2 * MAD) + values calculated per dataset. Default: 5 "sd:layout": advanced: true @@ -259,13 +269,13 @@ inputs: - "4" - "5" - "6" - default: "1" + default: "6" label: "Number of cores/cpus to use" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 1 + Default: 6 "sd:layout": advanced: true @@ -344,6 +354,18 @@ outputs: tab: "Raw" Caption: "Genes vs RNA reads" + raw_umi_mito_plot_png: + type: File? + outputSource: sc_rna_filter/raw_umi_mito_plot_png + label: "RNA reads vs mitochondrial %, raw" + doc: | + RNA reads vs mitochondrial % per cell + for raw data + "sd:visualPlugins": + - image: + tab: "Raw" + Caption: "RNA reads vs mitochondrial %" + raw_mito_dnst_plot_png: type: File? outputSource: sc_rna_filter/raw_mito_dnst_plot_png @@ -507,6 +529,18 @@ outputs: - fltr_gene_umi_plot_png: + fltr_umi_mito_plot_png: type: File? 
- outputSource: sc_rna_filter/fltr_gene_umi_plot_png + outputSource: sc_rna_filter/fltr_umi_mito_plot_png + label: "RNA reads vs mitochondrial %, filtered" + doc: | + RNA reads vs mitochondrial % per cell + for filtered data + "sd:visualPlugins": + - image: + tab: "Filtered" + Caption: "RNA reads vs mitochondrial %" + + fltr_gene_umi_plot_png: + type: File? + outputSource: sc_rna_filter/fltr_gene_umi_plot_png label: "Genes vs RNA reads, filtered" doc: | Genes vs RNA reads per cell @@ -619,17 +653,16 @@ outputs: ucsc_cb_html_data: type: Directory outputSource: sc_rna_filter/ucsc_cb_html_data - label: "UCSC Cell Browser data" + label: "UCSC Cell Browser (data)" doc: | - Directory with UCSC Cell Browser - data + UCSC Cell Browser html data. ucsc_cb_html_file: type: File outputSource: sc_rna_filter/ucsc_cb_html_file label: "UCSC Cell Browser" doc: | - UCSC Cell Browser HTML index file + UCSC Cell Browser html index. "sd:visualPlugins": - linkList: tab: "Overview" @@ -638,9 +671,10 @@ outputs: seurat_data_rds: type: File outputSource: sc_rna_filter/seurat_data_rds - label: "Processed seurat data in RDS format" + label: "Seurat object in RDS format" doc: | - Processed seurat data in RDS format + Seurat object. + RDS format. datasets_metadata: type: File @@ -653,26 +687,23 @@ outputs: pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder - label: "Plots in PDF format" + label: "Compressed folder with all PDF plots" doc: | - Compressed folder with plots - in PDF format + Compressed folder with all PDF plots. sc_rna_filter_stdout_log: type: File outputSource: sc_rna_filter/stdout_log - label: "Output log, filtering step" + label: "Output log" doc: | - stdout log generated by - sc_rna_filter step + Stdout log from the sc_rna_filter step. sc_rna_filter_stderr_log: type: File outputSource: sc_rna_filter/stderr_log - label: "Error log, filtering step" + label: "Error log" doc: | - stderr log generated by - sc_rna_filter step + Stderr log from the sc_rna_filter step. 
steps: @@ -733,6 +764,7 @@ steps: - raw_umi_dnst_plot_png - raw_gene_dnst_plot_png - raw_gene_umi_plot_png + - raw_umi_mito_plot_png - raw_mito_dnst_plot_png - raw_nvlt_dnst_plot_png - raw_qc_mtrcs_dnst_plot_png @@ -747,6 +779,7 @@ steps: - fltr_umi_dnst_plot_png - fltr_gene_dnst_plot_png - fltr_gene_umi_plot_png + - fltr_umi_mito_plot_png - fltr_mito_dnst_plot_png - fltr_nvlt_dnst_plot_png - fltr_qc_mtrcs_dnst_plot_png @@ -761,6 +794,7 @@ steps: - raw_umi_dnst_plot_pdf - raw_gene_dnst_plot_pdf - raw_gene_umi_plot_pdf + - raw_umi_mito_plot_pdf - raw_mito_dnst_plot_pdf - raw_nvlt_dnst_plot_pdf - raw_qc_mtrcs_dnst_plot_pdf @@ -775,6 +809,7 @@ steps: - fltr_umi_dnst_plot_pdf - fltr_gene_dnst_plot_pdf - fltr_gene_umi_plot_pdf + - fltr_umi_mito_plot_pdf - fltr_mito_dnst_plot_pdf - fltr_nvlt_dnst_plot_pdf - fltr_qc_mtrcs_dnst_plot_pdf @@ -801,6 +836,7 @@ steps: - sc_rna_filter/raw_umi_dnst_plot_pdf - sc_rna_filter/raw_gene_dnst_plot_pdf - sc_rna_filter/raw_gene_umi_plot_pdf + - sc_rna_filter/raw_umi_mito_plot_pdf - sc_rna_filter/raw_mito_dnst_plot_pdf - sc_rna_filter/raw_nvlt_dnst_plot_pdf - sc_rna_filter/raw_qc_mtrcs_dnst_plot_pdf @@ -815,6 +851,7 @@ steps: - sc_rna_filter/fltr_umi_dnst_plot_pdf - sc_rna_filter/fltr_gene_dnst_plot_pdf - sc_rna_filter/fltr_gene_umi_plot_pdf + - sc_rna_filter/fltr_umi_mito_plot_pdf - sc_rna_filter/fltr_mito_dnst_plot_pdf - sc_rna_filter/fltr_nvlt_dnst_plot_pdf - sc_rna_filter/fltr_qc_mtrcs_dnst_plot_pdf @@ -845,7 +882,7 @@ $schemas: label: "Single-Cell RNA-Seq Filtering Analysis" s:name: "Single-Cell RNA-Seq Filtering Analysis" -s:alternateName: "Removes low-quality cells" +s:alternateName: "Filters single-cell RNA-Seq datasets based on the common QC metrics" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-filter.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 8216bf8b..e180b59d 100644 
--- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -17,7 +17,7 @@ requirements: }; -'sd:upstream': +"sd:upstream": sc_tools_sample: - "sc-atac-cluster.cwl" - "sc-atac-reduce.cwl" @@ -41,8 +41,8 @@ inputs: multiome ATAC and RNA-Seq or just RNA-Seq datasets filtered by QC metrics to include only high-quality cells. - 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" - 'sd:localLabel': true + "sd:upstreamSource": "sc_tools_sample/seurat_data_rds" + "sd:localLabel": true normalization_method: type: @@ -118,7 +118,8 @@ inputs: doc: | Number of principal components to be used in PCA and UMAP projection. Accepted values - range from 1 to 50. + range from 1 to 50. Set to 0 to use + auto-estimated dimensionality. Default: 40 cell_cycle_data: @@ -243,7 +244,7 @@ inputs: datasets integration, and dimensionality reduction. Default: 3000 - 'sd:layout': + "sd:layout": advanced: true export_ucsc_cb: @@ -253,7 +254,7 @@ inputs: doc: | Export results into UCSC Cell Browser Default: false - 'sd:layout': + "sd:layout": advanced: true color_theme: @@ -289,13 +290,13 @@ inputs: - "4" - "5" - "6" - default: "1" + default: "6" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 1 + Default: 6 "sd:layout": advanced: true @@ -310,10 +311,10 @@ outputs: Elbow plot to evaluate the number of principal components that capture the majority of the variation in the data. - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'QC' - Caption: 'Elbow plot' + tab: "QC" + Caption: "Elbow plot" qc_dim_corr_plot_png: type: File? @@ -322,10 +323,10 @@ outputs: doc: | Correlation between QC metrics and principal components - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'QC' - Caption: 'Correlation between QC metrics and principal components' + tab: "QC" + Caption: "Correlation between QC metrics and principal components" umap_qc_mtrcs_plot_png: type: File? 
@@ -333,10 +334,10 @@ outputs: label: "UMAP, QC metrics" doc: | UMAP, QC metrics - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'QC' - Caption: 'UMAP, QC metrics' + tab: "QC" + Caption: "UMAP, QC metrics" ccpca_plot_png: type: File? @@ -344,10 +345,10 @@ outputs: label: "PCA, colored by cell cycle phase" doc: | PCA, colored by cell cycle phase - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'QC' - Caption: 'PCA, colored by cell cycle phase' + tab: "QC" + Caption: "PCA, colored by cell cycle phase" umap_plot_png: type: File? @@ -355,10 +356,10 @@ outputs: label: "UMAP, colored by dataset" doc: | UMAP, colored by dataset - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by dataset' + tab: "Per dataset" + Caption: "UMAP, colored by dataset" umap_spl_idnt_plot_png: type: File? @@ -366,10 +367,10 @@ outputs: label: "UMAP, split by dataset" doc: | UMAP, split by dataset - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, split by dataset' + tab: "Per dataset" + Caption: "UMAP, split by dataset" umap_spl_umi_plot_png: type: File? @@ -378,10 +379,10 @@ outputs: doc: | UMAP, colored by dataset, split by RNA reads per cell - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by dataset, split by RNA reads per cell' + tab: "Per dataset" + Caption: "UMAP, colored by dataset, split by RNA reads per cell" umap_spl_gene_plot_png: type: File? @@ -390,10 +391,10 @@ outputs: doc: | UMAP, colored by dataset, split by genes per cell - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by dataset, split by genes per cell' + tab: "Per dataset" + Caption: "UMAP, colored by dataset, split by genes per cell" umap_spl_mito_plot_png: type: File? 
@@ -402,10 +403,10 @@ outputs: doc: | UMAP, colored by dataset, split by mitochondrial percentage - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by dataset, split by mitochondrial percentage' + tab: "Per dataset" + Caption: "UMAP, colored by dataset, split by mitochondrial percentage" umap_spl_ph_plot_png: type: File? @@ -414,10 +415,10 @@ outputs: doc: | UMAP, colored by dataset, split by cell cycle phase - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by dataset, split by cell cycle phase' + tab: "Per dataset" + Caption: "UMAP, colored by dataset, split by cell cycle phase" ccpca_spl_idnt_plot_png: type: File? @@ -426,10 +427,10 @@ outputs: doc: | PCA, colored by cell cycle phase, split by dataset - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'PCA, colored by cell cycle phase, split by dataset' + tab: "Per dataset" + Caption: "PCA, colored by cell cycle phase, split by dataset" umap_spl_cnd_plot_png: type: File? @@ -438,10 +439,10 @@ outputs: doc: | UMAP, colored by dataset, split by grouping condition - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by dataset, split by grouping condition' + tab: "Per group" + Caption: "UMAP, colored by dataset, split by grouping condition" umap_gr_cnd_spl_umi_plot_png: type: File? @@ -450,10 +451,10 @@ outputs: doc: | UMAP, colored by grouping condition, split by RNA reads per cell - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by grouping condition, split by RNA reads per cell' + tab: "Per group" + Caption: "UMAP, colored by grouping condition, split by RNA reads per cell" umap_gr_cnd_spl_gene_plot_png: type: File? 
@@ -462,10 +463,10 @@ outputs: doc: | UMAP, colored by grouping condition, split by genes per cell - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by grouping condition, split by genes per cell' + tab: "Per group" + Caption: "UMAP, colored by grouping condition, split by genes per cell" umap_gr_cnd_spl_mito_plot_png: type: File? @@ -474,10 +475,10 @@ outputs: doc: | UMAP, colored by grouping condition, split by mitochondrial percentage - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by grouping condition, split by mitochondrial percentage' + tab: "Per group" + Caption: "UMAP, colored by grouping condition, split by mitochondrial percentage" umap_gr_cnd_spl_ph_plot_png: type: File? @@ -486,10 +487,10 @@ outputs: doc: | UMAP, colored by grouping condition, split by cell cycle phase - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by grouping condition, split by cell cycle phase' + tab: "Per group" + Caption: "UMAP, colored by grouping condition, split by cell cycle phase" ccpca_spl_cnd_plot_png: type: File? @@ -498,25 +499,24 @@ outputs: doc: | PCA, colored by cell cycle phase, split by grouping condition - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'PCA, colored by cell cycle phase, split by grouping condition' + tab: "Per group" + Caption: "PCA, colored by cell cycle phase, split by grouping condition" ucsc_cb_html_data: type: Directory? outputSource: sc_rna_reduce/ucsc_cb_html_data - label: "UCSC Cell Browser data" + label: "UCSC Cell Browser (data)" doc: | - Directory with UCSC Cell Browser - data + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? outputSource: sc_rna_reduce/ucsc_cb_html_file label: "UCSC Cell Browser" doc: | - UCSC Cell Browser HTML index file + UCSC Cell Browser html index. 
"sd:visualPlugins": - linkList: tab: "Overview" @@ -525,31 +525,31 @@ outputs: seurat_data_rds: type: File outputSource: sc_rna_reduce/seurat_data_rds - label: "Processed Seurat data in RDS format" + label: "Seurat object in RDS format" doc: | - Processed Seurat data in RDS format + Seurat object. + RDS format. pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder - label: "Plots in PDF format" + label: "Compressed folder with all PDF plots" doc: | - Compressed folder with plots - in PDF format + Compressed folder with all PDF plots. sc_rna_reduce_stdout_log: type: File outputSource: sc_rna_reduce/stdout_log - label: "stdout log generated by sc_rna_reduce step" + label: "Output log" doc: | - stdout log generated by sc_rna_reduce step + Stdout log from the sc_rna_reduce step. sc_rna_reduce_stderr_log: type: File outputSource: sc_rna_reduce/stderr_log - label: "stderr log generated by sc_rna_reduce step" + label: "Error log" doc: | - stderr log generated by sc_rna_reduce step + Stderr log from the sc_rna_reduce step. 
steps: @@ -705,7 +705,7 @@ $schemas: label: "Single-Cell RNA-Seq Dimensionality Reduction Analysis" s:name: "Single-Cell RNA-Seq Dimensionality Reduction Analysis" -s:alternateName: "Removes noise and confounding sources of variation by reducing dimensionality of gene expression data" +s:alternateName: "Integrates multiple single-cell RNA-Seq datasets, reduces dimensionality using PCA" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-reduce.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium diff --git a/workflows/sc-rna-trajectory.cwl b/workflows/sc-rna-trajectory.cwl index fef0f119..5ba0aa5e 100644 --- a/workflows/sc-rna-trajectory.cwl +++ b/workflows/sc-rna-trajectory.cwl @@ -17,7 +17,7 @@ requirements: }; -'sd:upstream': +"sd:upstream": sc_tools_sample: - "sc-rna-cluster.cwl" - "sc-atac-cluster.cwl" @@ -46,8 +46,8 @@ inputs: RNA-Seq Cluster Analysis", or "Single-Cell WNN Cluster Analysis" at any of the processing stages. - 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" - 'sd:localLabel': true + "sd:upstreamSource": "sc_tools_sample/seurat_data_rds" + "sd:localLabel": true dimensions: type: int? @@ -116,7 +116,7 @@ inputs: doc: | Export results into UCSC Cell Browser Default: false - 'sd:layout': + "sd:layout": advanced: true color_theme: @@ -152,13 +152,13 @@ inputs: - "4" - "5" - "6" - default: "1" + default: "6" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 1 + Default: 6 "sd:layout": advanced: true @@ -171,10 +171,10 @@ outputs: label: "Trajectory plot, colored by cluster" doc: | Trajectory plot, colored by cluster - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Trajectory' - Caption: 'Trajectory plot, colored by cluster' + tab: "Trajectory" + Caption: "Trajectory plot, colored by cluster" trjc_pstm_plot_png: type: File? 
@@ -182,10 +182,10 @@ outputs: label: "Trajectory plot, colored by pseudotime" doc: | Trajectory plot, colored by pseudotime - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Trajectory' - Caption: 'Trajectory plot, colored by pseudotime' + tab: "Trajectory" + Caption: "Trajectory plot, colored by pseudotime" dndr_gr_clst_plot_png: type: File? @@ -193,10 +193,10 @@ outputs: label: "Dendrogram plot, colored by cluster" doc: | Dendrogram plot, colored by cluster - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Trajectory' - Caption: 'Dendrogram plot, colored by cluster' + tab: "Trajectory" + Caption: "Dendrogram plot, colored by cluster" dndr_pstm_plot_png: type: File? @@ -204,10 +204,10 @@ outputs: label: "Dendrogram plot, colored by pseudotime" doc: | Dendrogram plot, colored by pseudotime - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Trajectory' - Caption: 'Dendrogram plot, colored by pseudotime' + tab: "Trajectory" + Caption: "Dendrogram plot, colored by pseudotime" grph_gr_clst_plot_png: type: File? @@ -215,10 +215,10 @@ outputs: label: "Trajectory graph, colored by cluster" doc: | Trajectory graph, colored by cluster - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Topology' - Caption: 'Trajectory graph, colored by cluster' + tab: "Topology" + Caption: "Trajectory graph, colored by cluster" grph_pstm_plot_png: type: File? @@ -226,10 +226,10 @@ outputs: label: "Trajectory graph, colored by pseudotime" doc: | Trajectory graph, colored by pseudotime - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Topology' - Caption: 'Trajectory graph, colored by pseudotime' + tab: "Topology" + Caption: "Trajectory graph, colored by pseudotime" tplg_plot_png: type: File? @@ -237,10 +237,10 @@ outputs: label: "Topology plot" doc: | Topology plot - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Topology' - Caption: 'Topology plot' + tab: "Topology" + Caption: "Topology plot" xpr_htmp_plot_png: type: File? 
@@ -248,10 +248,10 @@ outputs: label: "Gene expression heatmap" doc: | Gene expression heatmap - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Gene expression' - Caption: 'Gene expression heatmap' + tab: "Gene expression" + Caption: "Gene expression heatmap" xpr_pstm_plot_png: type: File? @@ -259,10 +259,10 @@ outputs: label: "Gene expression along pseudotime" doc: | Gene expression along pseudotime - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Gene expression' - Caption: 'Gene expression along pseudotime' + tab: "Gene expression" + Caption: "Gene expression along pseudotime" umap_rd_rnaumap_plot_png: type: File? @@ -270,10 +270,10 @@ outputs: label: "UMAP, colored by pseudotime, RNA" doc: | UMAP, colored by pseudotime, RNA - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Pseudotime' - Caption: 'UMAP, colored by pseudotime, RNA' + tab: "Pseudotime" + Caption: "UMAP, colored by pseudotime, RNA" umap_rd_atacumap_plot_png: type: File? @@ -281,10 +281,10 @@ outputs: label: "UMAP, colored by pseudotime, ATAC" doc: | UMAP, colored by pseudotime, ATAC - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Pseudotime' - Caption: 'UMAP, colored by pseudotime, ATAC' + tab: "Pseudotime" + Caption: "UMAP, colored by pseudotime, ATAC" umap_rd_wnnumap_plot_png: type: File? @@ -292,10 +292,10 @@ outputs: label: "UMAP, colored by pseudotime, WNN" doc: | UMAP, colored by pseudotime, WNN - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Pseudotime' - Caption: 'UMAP, colored by pseudotime, WNN' + tab: "Pseudotime" + Caption: "UMAP, colored by pseudotime, WNN" pstm_dnst_spl_idnt_plot_png: type: File? 
@@ -303,10 +303,10 @@ outputs: label: "Pseudotime density, split by dataset" doc: | Pseudotime density, split by dataset - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Pseudotime density, split by dataset' + tab: "Per dataset" + Caption: "Pseudotime density, split by dataset" pstm_hist_gr_clst_spl_idnt_plot_png: type: File? @@ -316,10 +316,10 @@ outputs: Pseudotime histogram, colored by cluster, split by dataset - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Pseudotime histogram, colored by cluster, split by dataset' + tab: "Per dataset" + Caption: "Pseudotime histogram, colored by cluster, split by dataset" umap_spl_idnt_rd_rnaumap_plot_png: type: File? @@ -328,10 +328,10 @@ outputs: doc: | UMAP, colored by pseudotime, split by dataset, RNA - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by pseudotime, split by dataset, RNA' + tab: "Per dataset" + Caption: "UMAP, colored by pseudotime, split by dataset, RNA" umap_spl_idnt_rd_atacumap_plot_png: type: File? @@ -340,10 +340,10 @@ outputs: doc: | UMAP, colored by pseudotime, split by dataset, ATAC - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by pseudotime, split by dataset, ATAC' + tab: "Per dataset" + Caption: "UMAP, colored by pseudotime, split by dataset, ATAC" umap_spl_idnt_rd_wnnumap_plot_png: type: File? @@ -352,10 +352,10 @@ outputs: doc: | UMAP, colored by pseudotime, split by dataset, WNN - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by pseudotime, split by dataset, WNN' + tab: "Per dataset" + Caption: "UMAP, colored by pseudotime, split by dataset, WNN" pstm_dnst_spl_cnd_plot_png: type: File? 
@@ -364,10 +364,10 @@ outputs: doc: | Pseudotime density, split by grouping condition - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'Pseudotime density, split by grouping condition' + tab: "Per group" + Caption: "Pseudotime density, split by grouping condition" pstm_hist_gr_clst_spl_cnd_plot_png: type: File? @@ -376,10 +376,10 @@ outputs: doc: | Pseudotime histogram, colored by cluster, split by grouping condition - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'Pseudotime histogram, colored by cluster, split by grouping condition' + tab: "Per group" + Caption: "Pseudotime histogram, colored by cluster, split by grouping condition" umap_spl_cnd_rd_rnaumap_plot_png: type: File? @@ -388,10 +388,10 @@ outputs: doc: | UMAP, colored by pseudotime, split by grouping condition, RNA - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by pseudotime, split by grouping condition, RNA' + tab: "Per group" + Caption: "UMAP, colored by pseudotime, split by grouping condition, RNA" umap_spl_cnd_rd_atacumap_plot_png: type: File? @@ -400,10 +400,10 @@ outputs: doc: | UMAP, colored by pseudotime, split by grouping condition, ATAC - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by pseudotime, split by grouping condition, ATAC' + tab: "Per group" + Caption: "UMAP, colored by pseudotime, split by grouping condition, ATAC" umap_spl_cnd_rd_wnnumap_plot_png: type: File? @@ -412,25 +412,24 @@ outputs: doc: | UMAP, colored by pseudotime, split by grouping condition, WNN - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by pseudotime, split by grouping condition, WNN' + tab: "Per group" + Caption: "UMAP, colored by pseudotime, split by grouping condition, WNN" ucsc_cb_html_data: type: Directory? 
outputSource: rna_trajectory/ucsc_cb_html_data - label: "UCSC Cell Browser data" + label: "UCSC Cell Browser (data)" doc: | - Directory with UCSC Cell Browser - data + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? outputSource: rna_trajectory/ucsc_cb_html_file label: "UCSC Cell Browser" doc: | - UCSC Cell Browser HTML index file + UCSC Cell Browser html index. "sd:visualPlugins": - linkList: tab: "Overview" @@ -439,31 +438,31 @@ outputs: seurat_data_rds: type: File outputSource: rna_trajectory/seurat_data_rds - label: "Processed Seurat data in RDS format" + label: "Seurat object in RDS format" doc: | - Processed Seurat data in RDS format + Seurat object. + RDS format. pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder - label: "Plots in PDF format" + label: "Compressed folder with all PDF plots" doc: | - Compressed folder with plots - in PDF format + Compressed folder with all PDF plots. rna_trajectory_stdout_log: type: File outputSource: rna_trajectory/stdout_log - label: "stdout log generated by rna_trajectory step" + label: "Output log" doc: | - stdout log generated by rna_trajectory step + Stdout log from the rna_trajectory step. rna_trajectory_stderr_log: type: File outputSource: rna_trajectory/stderr_log - label: "stderr log generated by rna_trajectory step" + label: "Error log" doc: | - stderr log generated by rna_trajectory step + Stderr log from the rna_trajectory step. 
steps: @@ -600,7 +599,7 @@ $schemas: label: "Single-Cell RNA-Seq Trajectory Analysis" s:name: "Single-Cell RNA-Seq Trajectory Analysis" -s:alternateName: "Infers developmental trajectories and pseudotime from cells clustered by similarity of gene expression data" +s:alternateName: "Aligns cells along the trajectory defined based on PCA or other dimensionality reduction" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-trajectory.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium diff --git a/workflows/sc-triangulate.cwl b/workflows/sc-triangulate.cwl index bc62c44b..f46ab915 100644 --- a/workflows/sc-triangulate.cwl +++ b/workflows/sc-triangulate.cwl @@ -23,7 +23,7 @@ requirements: }; -'sd:upstream': +"sd:upstream": sc_tools_sample: - "sc-rna-cluster.cwl" - "sc-atac-cluster.cwl" @@ -46,8 +46,8 @@ inputs: genes expression and/or chromatin accessibility information stored in the RNA and/or ATAC assays correspondingly. Additionally, 'rnaumap', and/or 'atacumap', and/or 'wnnumap' dimensionality reductions should be present. - 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" - 'sd:localLabel': true + "sd:upstreamSource": "sc_tools_sample/seurat_data_rds" + "sd:localLabel": true query_reduction: type: @@ -99,7 +99,7 @@ inputs: Color theme for all generated plots. One of gray, bw, linedraw, light, dark, minimal, classic, void. Default: classic - 'sd:layout': + "sd:layout": advanced: true threads: @@ -113,14 +113,14 @@ inputs: - "4" - "5" - "6" - default: "1" + default: "6" label: "Number of cores/cpus to use" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 1 - 'sd:layout': + Default: 6 + "sd:layout": advanced: true @@ -133,10 +133,10 @@ outputs: doc: | Cells UMAP with integrated labels (rnaumap dim. reduction). 
PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'RNA' - Caption: 'Cells UMAP with integrated labels' + tab: "RNA" + Caption: "Cells UMAP with integrated labels" umap_tric_rd_rnaumap_plot_png: type: File? @@ -145,10 +145,10 @@ outputs: doc: | Cells UMAP with integration confidence scores (rnaumap dim. reduction). PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'RNA' - Caption: 'Cells UMAP with integration confidence scores' + tab: "RNA" + Caption: "Cells UMAP with integration confidence scores" umap_tria_rd_rnaumap_plot_png: type: File? @@ -157,10 +157,10 @@ outputs: doc: | Cells UMAP with winning annotations (rnaumap dim. reduction). PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'RNA' - Caption: 'Cells UMAP with winning annotations' + tab: "RNA" + Caption: "Cells UMAP with winning annotations" umap_tril_rd_atacumap_plot_png: type: File? @@ -169,10 +169,10 @@ outputs: doc: | Cells UMAP with integrated labels (atacumap dim. reduction). PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'ATAC' - Caption: 'Cells UMAP with integrated labels' + tab: "ATAC" + Caption: "Cells UMAP with integrated labels" umap_tric_rd_atacumap_plot_png: type: File? @@ -181,10 +181,10 @@ outputs: doc: | Cells UMAP with integration confidence scores (atacumap dim. reduction). PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'ATAC' - Caption: 'Cells UMAP with integration confidence scores' + tab: "ATAC" + Caption: "Cells UMAP with integration confidence scores" umap_tria_rd_atacumap_plot_png: type: File? @@ -193,10 +193,10 @@ outputs: doc: | Cells UMAP with winning annotations (atacumap dim. reduction). PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'ATAC' - Caption: 'Cells UMAP with winning annotations' + tab: "ATAC" + Caption: "Cells UMAP with winning annotations" umap_tril_rd_wnnumap_plot_png: type: File? 
@@ -205,10 +205,10 @@ outputs: doc: | Cells UMAP with integrated labels (wnnumap dim. reduction). PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'WNN' - Caption: 'Cells UMAP with integrated labels' + tab: "WNN" + Caption: "Cells UMAP with integrated labels" umap_tric_rd_wnnumap_plot_png: type: File? @@ -217,10 +217,10 @@ outputs: doc: | Cells UMAP with integration confidence scores (wnnumap dim. reduction). PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'WNN' - Caption: 'Cells UMAP with integration confidence scores' + tab: "WNN" + Caption: "Cells UMAP with integration confidence scores" umap_tria_rd_wnnumap_plot_png: type: File? @@ -229,57 +229,57 @@ outputs: doc: | Cells UMAP with winning annotations (wnnumap dim. reduction). PNG format - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'WNN' - Caption: 'Cells UMAP with winning annotations' + tab: "WNN" + Caption: "Cells UMAP with winning annotations" ucsc_cb_html_data: type: Directory outputSource: triangulate/ucsc_cb_html_data - label: "Directory with UCSC Cellbrowser html data" + label: "UCSC Cell Browser (data)" doc: | - Directory with UCSC Cellbrowser html data. + UCSC Cell Browser html data. ucsc_cb_html_file: type: File outputSource: triangulate/ucsc_cb_html_file - label: "Open in UCSC Cell Browser" + label: "UCSC Cell Browser" doc: | - HTML index file from the directory with UCSC Cellbrowser html data. - 'sd:visualPlugins': + UCSC Cell Browser html index. + "sd:visualPlugins": - linkList: - tab: 'Overview' + tab: "Overview" target: "_blank" seurat_data_rds: type: File outputSource: triangulate/seurat_data_rds - label: "Processed Seurat data in RDS format" + label: "Seurat object in RDS format" doc: | - Processed Seurat data in RDS format + Seurat object. + RDS format. 
pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder - label: "Plots in PDF format" + label: "Compressed folder with all PDF plots" doc: | - Compressed folder with plots - in PDF format + Compressed folder with all PDF plots. triangulate_stdout_log: type: File outputSource: triangulate/stdout_log - label: "stdout log generated by triangulate step" + label: "Output log" doc: | - stdout log generated by triangulate step + Stdout log from the triangulate step. triangulate_stderr_log: type: File outputSource: triangulate/stderr_log - label: "stderr log generated by triangulate step" + label: "Error log" doc: | - stderr log generated by triangulate step + Stderr log from the triangulate step. steps: @@ -365,8 +365,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Single-cell Label Integration Analysis" -s:name: "Single-cell Label Integration Analysis" +label: "Single-Cell Label Integration Analysis" +s:name: "Single-Cell Label Integration Analysis" s:alternateName: "Harmonizes conflicting annotations in single-cell genomics studies" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-triangulate.cwl @@ -405,6 +405,6 @@ s:creator: doc: | - Single-cell Label Integration Analysis + Single-Cell Label Integration Analysis Harmonizes conflicting annotations in single-cell genomics studies. \ No newline at end of file diff --git a/workflows/sc-vdj-profile.cwl b/workflows/sc-vdj-profile.cwl index efb4c97b..57ae8e1e 100644 --- a/workflows/sc-vdj-profile.cwl +++ b/workflows/sc-vdj-profile.cwl @@ -9,7 +9,7 @@ requirements: - class: InlineJavascriptRequirement -'sd:upstream': +"sd:upstream": sc_tools_sample: - "sc-rna-cluster.cwl" - "sc-ctype-assign.cwl" @@ -36,8 +36,8 @@ inputs: Assignment" or "Single-Cell RNA-Seq Cluster Analysis" at any of the processing stages. 
- 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" - 'sd:localLabel': true + "sd:upstreamSource": "sc_tools_sample/seurat_data_rds" + "sd:localLabel": true contigs_data: type: File @@ -50,8 +50,8 @@ inputs: can be obtained from either "Cell Ranger Count (RNA+VDJ)" or "Cell Ranger Aggregate (RNA, RNA+VDJ)" pipeline. - 'sd:upstreamSource': "sc_vdj_sample/filtered_contig_annotations_csv" - 'sd:localLabel': true + "sd:upstreamSource": "sc_vdj_sample/filtered_contig_annotations_csv" + "sd:localLabel": true query_source_column: type: string @@ -172,13 +172,13 @@ inputs: - "4" - "5" - "6" - default: "1" + default: "6" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 1 + Default: 6 "sd:layout": advanced: true @@ -192,10 +192,10 @@ outputs: doc: | Unique clonotypes, split by dataset - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Unique clonotypes, split by dataset' + tab: "Per dataset" + Caption: "Unique clonotypes, split by dataset" hmst_spl_idnt_plot_png: type: File? @@ -204,10 +204,10 @@ outputs: doc: | Clonal space homeostasis, split by dataset - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Clonal space homeostasis, split by dataset' + tab: "Per dataset" + Caption: "Clonal space homeostasis, split by dataset" vrlp_spl_idnt_plot_png: type: File? @@ -216,10 +216,10 @@ outputs: doc: | Clonotypes similarity, split by dataset - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Clonotypes similarity, split by dataset' + tab: "Per dataset" + Caption: "Clonotypes similarity, split by dataset" ntwr_gr_idnt_plot_png: type: File? 
@@ -228,10 +228,10 @@ outputs: doc: | Clonotypes network, colored by dataset - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Clonotypes network, colored by dataset' + tab: "Per dataset" + Caption: "Clonotypes network, colored by dataset" dvrs_gr_clst_spl_idnt_plot_png: type: File? @@ -241,10 +241,10 @@ outputs: Clonotypes diversity, colored by cluster, split by dataset - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Clonotypes diversity, colored by cluster, split by dataset' + tab: "Per dataset" + Caption: "Clonotypes diversity, colored by cluster, split by dataset" chrd_gr_idnt_plot_png: type: File? @@ -253,10 +253,10 @@ outputs: doc: | Shared clonotype, colored by dataset - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Shared clonotype, colored by dataset' + tab: "Per dataset" + Caption: "Shared clonotype, colored by dataset" gene_spl_idnt_vdjc_plot_png: type: @@ -268,10 +268,10 @@ outputs: doc: | Relative usage of V, D, J, C genes, split by dataset - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Relative usage of V, D, J, C genes, split by dataset' + tab: "Per dataset" + Caption: "Relative usage of V, D, J, C genes, split by dataset" count_spl_clst_plot_png: type: File? @@ -280,10 +280,10 @@ outputs: doc: | Unique clonotypes, split by cluster - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per cluster' - Caption: 'Unique clonotypes, split by cluster' + tab: "Per cluster" + Caption: "Unique clonotypes, split by cluster" hmst_spl_clst_plot_png: type: File? @@ -292,10 +292,10 @@ outputs: doc: | Clonal space homeostasis, split by cluster - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per cluster' - Caption: 'Clonal space homeostasis, split by cluster' + tab: "Per cluster" + Caption: "Clonal space homeostasis, split by cluster" vrlp_spl_clst_plot_png: type: File? 
@@ -304,10 +304,10 @@ outputs: doc: | Clonotypes similarity, split by cluster - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per cluster' - Caption: 'Clonotypes similarity, split by cluster' + tab: "Per cluster" + Caption: "Clonotypes similarity, split by cluster" ntwr_gr_clst_plot_png: type: File? @@ -316,10 +316,10 @@ outputs: doc: | Clonotypes network, colored by cluster - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per cluster' - Caption: 'Clonotypes network, colored by cluster' + tab: "Per cluster" + Caption: "Clonotypes network, colored by cluster" dvrs_gr_idnt_spl_clst_plot_png: type: File? @@ -329,10 +329,10 @@ outputs: Clonotypes diversity, colored by dataset, split by cluster - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per cluster' - Caption: 'Clonotypes diversity, colored by dataset, split by cluster' + tab: "Per cluster" + Caption: "Clonotypes diversity, colored by dataset, split by cluster" chrd_gr_clst_plot_png: type: File? @@ -341,10 +341,10 @@ outputs: doc: | Shared clonotype, colored by cluster - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per cluster' - Caption: 'Shared clonotype, colored by cluster' + tab: "Per cluster" + Caption: "Shared clonotype, colored by cluster" gene_spl_clst_vdjc_plot_png: type: @@ -356,10 +356,10 @@ outputs: doc: | Relative usage of V, D, J, C genes, split by cluster - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per cluster' - Caption: 'Relative usage of V, D, J, C genes, split by cluster' + tab: "Per cluster" + Caption: "Relative usage of V, D, J, C genes, split by cluster" count_spl_cnd_plot_png: type: File? @@ -369,10 +369,10 @@ outputs: Unique clonotypes, split by grouping condition - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'Unique clonotypes, split by grouping condition' + tab: "Per group" + Caption: "Unique clonotypes, split by grouping condition" hmst_spl_cnd_plot_png: type: File? 
@@ -381,10 +381,10 @@ outputs: doc: | Clonal space homeostasis, split by grouping condition - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'Clonal space homeostasis, split by grouping condition' + tab: "Per group" + Caption: "Clonal space homeostasis, split by grouping condition" vrlp_spl_cnd_plot_png: type: File? @@ -393,10 +393,10 @@ outputs: doc: | Clonotypes similarity, split by grouping condition - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'Clonotypes similarity, split by grouping condition' + tab: "Per group" + Caption: "Clonotypes similarity, split by grouping condition" ntwr_gr_cnd_plot_png: type: File? @@ -405,10 +405,10 @@ outputs: doc: | Clonotypes network, colored by grouping condition - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'Clonotypes network, colored by grouping condition' + tab: "Per group" + Caption: "Clonotypes network, colored by grouping condition" dvrs_gr_clst_spl_cnd_plot_png: type: File? @@ -418,10 +418,10 @@ outputs: Clonotypes diversity, colored by cluster, split by grouping condition - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'Clonotypes diversity, colored by cluster, split by grouping condition' + tab: "Per group" + Caption: "Clonotypes diversity, colored by cluster, split by grouping condition" dvrs_gr_cnd_spl_clst_plot_png: type: File? @@ -431,10 +431,10 @@ outputs: Clonotypes diversity, colored by grouping condition, split by cluster - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'Clonotypes diversity, colored by grouping condition, split by cluster' + tab: "Per group" + Caption: "Clonotypes diversity, colored by grouping condition, split by cluster" chrd_gr_cnd_plot_png: type: File? 
@@ -444,25 +444,24 @@ outputs: Shared clonotype, colored by grouping condition - 'sd:visualPlugins': + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'Shared clonotype, colored by grouping condition' + tab: "Per group" + Caption: "Shared clonotype, colored by grouping condition" ucsc_cb_html_data: type: Directory? outputSource: vdj_profile/ucsc_cb_html_data - label: "UCSC Cell Browser data" + label: "UCSC Cell Browser (data)" doc: | - Directory with UCSC Cell Browser - data + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? outputSource: vdj_profile/ucsc_cb_html_file label: "UCSC Cell Browser" doc: | - UCSC Cell Browser HTML index file + UCSC Cell Browser html index. "sd:visualPlugins": - linkList: tab: "Overview" @@ -471,38 +470,40 @@ outputs: seurat_data_rds: type: File outputSource: vdj_profile/seurat_data_rds - label: "Processed Seurat data in RDS format" + label: "Seurat object in RDS format" doc: | - Processed Seurat data in RDS format + Seurat object. + RDS format. seurat_data_scope: type: File? outputSource: vdj_profile/seurat_data_scope - label: "Processed Seurat data in SCope compatible loom format" + label: "Seurat object in SCope compatible loom format" doc: | - Processed Seurat data in SCope compatible loom format + Seurat object. + SCope compatible. + Loom format. pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder - label: "Plots in PDF format" + label: "Compressed folder with all PDF plots" doc: | - Compressed folder with plots - in PDF format + Compressed folder with all PDF plots. vdj_profile_stdout_log: type: File outputSource: vdj_profile/stdout_log - label: "stdout log generated by vdj_profile step" + label: "Output log" doc: | - stdout log generated by vdj_profile step + Stdout log from the vdj_profile step. 
vdj_profile_stderr_log: type: File outputSource: vdj_profile/stderr_log - label: "stderr log generated by vdj_profile step" + label: "Error log" doc: | - stderr log generated by vdj_profile step + Stderr log from the vdj_profile step. steps: @@ -635,7 +636,7 @@ $schemas: label: "Single-Cell Immune Profiling Analysis" s:name: "Single-Cell Immune Profiling Analysis" -s:alternateName: "Estimates clonotype diversity and dynamics from V(D)J sequencing data assembled into contigs" +s:alternateName: "TCR/BCR clonotype dynamics analysis" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-vdj-profile.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index 88c37752..d18b23a2 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -21,7 +21,7 @@ requirements: }; -'sd:upstream': +"sd:upstream": sc_tools_sample: - "sc-wnn-cluster.cwl" - "sc-rna-cluster.cwl" @@ -52,8 +52,8 @@ inputs: Analysis" and "Single-Cell ATAC-Seq Dimensionality Reduction Analysis" at any of the processing stages. - 'sd:upstreamSource': "sc_tools_sample/seurat_data_rds" - 'sd:localLabel': true + "sd:upstreamSource": "sc_tools_sample/seurat_data_rds" + "sd:localLabel": true atac_fragments_file: type: File? @@ -67,8 +67,8 @@ inputs: sample can be obtained from either "Cell Ranger Count (RNA+ATAC)" or "Cell Ranger Aggregate (RNA+ATAC)" pipeline - 'sd:upstreamSource': "sc_arc_sample/atac_fragments_file" - 'sd:localLabel': true + "sd:upstreamSource": "sc_arc_sample/atac_fragments_file" + "sd:localLabel": true rna_dimensions: type: int? @@ -189,78 +189,109 @@ inputs: - "4" - "5" - "6" - default: "1" + default: "6" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. 
- Default: 1 + Default: 6 "sd:layout": advanced: true outputs: - umap_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_wnn_cluster/umap_res_plot_png - label: "UMAP, colored by cluster" + umap_gr_ph_spl_idnt_plot_png: + type: File? + outputSource: sc_wnn_cluster/umap_gr_ph_spl_idnt_plot_png + label: "UMAP colored by cell cycle phase (split by dataset, downsampled)" doc: | - UMAP, colored by cluster - 'sd:visualPlugins': + UMAP colored by cell cycle phase. + Split by dataset; downsampled to the + smallest dataset. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per cluster' - Caption: 'UMAP, colored by cluster' + tab: "Per dataset" + Caption: "UMAP colored by cell cycle phase (split by dataset, downsampled)" - umap_spl_ph_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_wnn_cluster/umap_spl_ph_res_plot_png - label: "UMAP, colored by cluster, split by cell cycle phase" + cmp_gr_ph_spl_idnt_plot_png: + type: File? + outputSource: sc_wnn_cluster/cmp_gr_ph_spl_idnt_plot_png + label: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" doc: | - UMAP, colored by cluster, - split by cell cycle phase - 'sd:visualPlugins': + Composition plot colored by cell cycle phase. + Split by dataset; downsampled to the smallest + dataset. + PNG format + "sd:visualPlugins": - image: - tab: 'Per cluster' - Caption: 'UMAP, colored by cluster, split by cell cycle phase' + tab: "Per dataset" + Caption: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" - cmp_gr_ph_spl_clst_res_plot_png: + umap_gr_ph_spl_cnd_plot_png: + type: File? + outputSource: sc_wnn_cluster/umap_gr_ph_spl_cnd_plot_png + label: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" + doc: | + UMAP colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "Per group" + Caption: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" + + cmp_gr_ph_spl_cnd_plot_png: + type: File? + outputSource: sc_wnn_cluster/cmp_gr_ph_spl_cnd_plot_png + label: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" + doc: | + Composition plot colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + "sd:visualPlugins": + - image: + tab: "Per group" + Caption: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" + + umap_gr_clst_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_wnn_cluster/cmp_gr_ph_spl_clst_res_plot_png - label: "Composition plot, colored by cell cycle phase, split by cluster, downsampled" + outputSource: sc_wnn_cluster/umap_gr_clst_res_plot_png + label: "UMAP colored by cluster (all cells)" doc: | - Composition plot, colored by - cell cycle phase, split by - cluster, downsampled - 'sd:visualPlugins': + UMAP colored by cluster. + All cells. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per cluster' - Caption: 'Composition plot, colored by cell cycle phase, split by cluster, downsampled' + tab: "Per cluster" + Caption: "UMAP colored by cluster (all cells)" - umap_spl_idnt_res_plot_png: + umap_gr_clst_spl_idnt_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_wnn_cluster/umap_spl_idnt_res_plot_png - label: "UMAP, colored by cluster, split by dataset" + outputSource: sc_wnn_cluster/umap_gr_clst_spl_idnt_res_plot_png + label: "UMAP colored by cluster (split by dataset, downsampled)" doc: | - UMAP, colored by cluster, - split by dataset - 'sd:visualPlugins': + UMAP colored by cluster. + Split by dataset; downsampled + to the smallest dataset. + PNG format. 
+ "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'UMAP, colored by cluster, split by dataset' + tab: "Per dataset" + Caption: "UMAP colored by cluster (split by dataset, downsampled)" cmp_gr_clst_spl_idnt_res_plot_png: type: @@ -268,15 +299,16 @@ outputs: - type: array items: File outputSource: sc_wnn_cluster/cmp_gr_clst_spl_idnt_res_plot_png - label: "Composition plot, colored by cluster, split by dataset, downsampled" + label: "Composition plot colored by cluster (split by dataset, downsampled)" doc: | - Composition plot, colored by - cluster, split by dataset, - downsampled - 'sd:visualPlugins': + Composition plot colored by cluster. + Split by dataset; downsampled + to the smallest dataset. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Composition plot, colored by cluster, split by dataset, downsampled' + tab: "Per dataset" + Caption: "Composition plot colored by cluster (split by dataset, downsampled)" cmp_gr_idnt_spl_clst_res_plot_png: type: @@ -284,43 +316,70 @@ outputs: - type: array items: File outputSource: sc_wnn_cluster/cmp_gr_idnt_spl_clst_res_plot_png - label: "Composition plot, colored by dataset, split by cluster, downsampled" + label: "Composition plot colored by dataset (split by cluster, downsampled)" doc: | - Composition plot, colored by - dataset, split by cluster, - downsampled - 'sd:visualPlugins': + Composition plot colored by dataset. + Split by cluster; downsampled to the + smallest dataset. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Composition plot, colored by dataset, split by cluster, downsampled' + tab: "Per dataset" + Caption: "Composition plot colored by dataset (split by cluster, downsampled)" - cmp_gr_ph_spl_idnt_plot_png: - type: File? 
- outputSource: sc_wnn_cluster/cmp_gr_ph_spl_idnt_plot_png - label: "Composition plot, colored by cell cycle phase, split by dataset, downsampled" - doc: | - Composition plot, colored by - cell cycle phase, split by - dataset, downsampled - 'sd:visualPlugins': + umap_gr_clst_spl_ph_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/umap_gr_clst_spl_ph_res_plot_png + label: "UMAP colored by cluster (split by cell cycle phase, optionally downsampled)" + doc: | + UMAP colored by cluster. + Split by cell cycle phase; downsampled + to the smallest dataset (if multiple + datasets are analyzed jointly). + PNG format. + "sd:visualPlugins": - image: - tab: 'Per dataset' - Caption: 'Composition plot, colored by cell cycle phase, split by dataset, downsampled' + tab: "Per cluster" + Caption: "UMAP colored by cluster (split by cell cycle phase, optionally downsampled)" - umap_spl_cnd_res_plot_png: + cmp_gr_ph_spl_clst_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_wnn_cluster/umap_spl_cnd_res_plot_png - label: "UMAP, colored by cluster, split by grouping condition" + outputSource: sc_wnn_cluster/cmp_gr_ph_spl_clst_res_plot_png + label: "Composition plot colored by cell cycle phase (split by cluster, optionally downsampled)" doc: | - UMAP, colored by cluster, - split by grouping condition - 'sd:visualPlugins': + Composition plot colored by cell cycle phase. + Split by cluster; downsampled to the smallest + dataset (if multiple datasets are analyzed + jointly). + PNG format + "sd:visualPlugins": + - image: + tab: "Per cluster" + Caption: "Composition plot colored by cell cycle phase (split by cluster, optionally downsampled)" + + umap_gr_clst_spl_cnd_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/umap_gr_clst_spl_cnd_res_plot_png + label: "UMAP colored by cluster (split by grouping condition, downsampled)" + doc: | + UMAP colored by cluster. 
+ Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'UMAP, colored by cluster, split by grouping condition' + tab: "Per group" + Caption: "UMAP colored by cluster (split by grouping condition, downsampled)" cmp_gr_clst_spl_cnd_res_plot_png: type: @@ -328,15 +387,17 @@ outputs: - type: array items: File outputSource: sc_wnn_cluster/cmp_gr_clst_spl_cnd_res_plot_png - label: "Composition plot, colored by cluster, split by grouping condition, downsampled" + label: "Composition plot colored by cluster (split by grouping condition, downsampled)" doc: | - Composition plot, colored by - cluster, split by grouping - condition, downsampled - 'sd:visualPlugins': + Composition plot colored by cluster. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'Composition plot, colored by cluster, split by grouping condition, downsampled' + tab: "Per group" + Caption: "Composition plot colored by cluster (split by grouping condition, downsampled)" cmp_gr_cnd_spl_clst_res_plot_png: type: @@ -344,57 +405,64 @@ outputs: - type: array items: File outputSource: sc_wnn_cluster/cmp_gr_cnd_spl_clst_res_plot_png - label: "Composition plot, colored by grouping condition, split by cluster, downsampled" + label: "Composition plot colored by grouping condition (split by cluster, downsampled)" doc: | - Composition plot, colored by - grouping condition, split by - cluster, downsampled - 'sd:visualPlugins': + Composition plot colored by grouping condition. + Split by cluster; first downsampled to the + smallest dataset, then downsampled to the + smallest group. + PNG format. 
+ "sd:visualPlugins": - image: - tab: 'Per group' - Caption: 'Composition plot, colored by grouping condition, split by cluster, downsampled' + tab: "Per group" + Caption: "Composition plot colored by grouping condition (split by cluster, downsampled)" - xpr_avg_res_plot_png: + xpr_per_cell_plot_png: type: - "null" - type: array items: File - outputSource: sc_wnn_cluster/xpr_avg_res_plot_png - label: "Gene expression dot plot" + outputSource: sc_wnn_cluster/xpr_per_cell_plot_png + label: "UMAP colored by gene expression (per gene)" doc: | - Gene expression dot plot - 'sd:visualPlugins': + UMAP colored by gene expression. + All genes of interest. + PNG format. + "sd:visualPlugins": - image: - tab: 'Genes of interest' - Caption: 'Gene expression dot plot' + tab: "Gene expression" + Caption: "UMAP colored by gene expression (per gene)" - xpr_dnst_res_plot_png: + xpr_avg_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_wnn_cluster/xpr_dnst_res_plot_png - label: "Gene expression violin plot" + outputSource: sc_wnn_cluster/xpr_avg_res_plot_png + label: "Average gene expression" doc: | - Gene expression violin plot - 'sd:visualPlugins': + Average gene expression. + PNG format. + "sd:visualPlugins": - image: - tab: 'Genes of interest' - Caption: 'Gene expression violin plot' + tab: "Gene expression" + Caption: "Average gene expression" - xpr_per_cell_plot_png: + xpr_dnst_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_wnn_cluster/xpr_per_cell_plot_png - label: "UMAP, gene expression" + outputSource: sc_wnn_cluster/xpr_dnst_res_plot_png + label: "Gene expression density (per gene)" doc: | - UMAP, gene expression - 'sd:visualPlugins': + Gene expression density. + All genes of interest. + PNG format. 
+ "sd:visualPlugins": - image: - tab: 'Genes of interest' - Caption: 'UMAP, gene expression' + tab: "Gene expression" + Caption: "Gene expression density (per gene)" xpr_htmp_res_plot_png: type: @@ -402,77 +470,81 @@ outputs: - type: array items: File outputSource: sc_wnn_cluster/xpr_htmp_res_plot_png - label: "Gene expression heatmap" + label: "Gene expression heatmap (top gene markers)" doc: | - Gene expression heatmap - 'sd:visualPlugins': + Gene expression heatmap. + Top gene markers. + PNG format. + "sd:visualPlugins": - image: - tab: 'Heatmap' - Caption: 'Gene expression heatmap' + tab: "Gene expression heatmap" + Caption: "Gene expression heatmap (top gene markers)" - cvrg_res_plot_png: + xpr_htmp_res_tsv: type: - "null" - type: array items: File - outputSource: sc_wnn_cluster/cvrg_res_plot_png - label: "ATAC fragments coverage" + outputSource: sc_wnn_cluster/xpr_htmp_res_tsv + label: "Gene expression heatmap (top gene markers)" doc: | - ATAC fragments coverage - 'sd:visualPlugins': - - image: - tab: 'Genome coverage' - Caption: 'ATAC fragments coverage' + Gene expression heatmap. + Top gene markers. + TSV format. - xpr_htmp_res_tsv: + cvrg_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_wnn_cluster/xpr_htmp_res_tsv - label: "Markers from gene expression heatmap" + outputSource: sc_wnn_cluster/cvrg_res_plot_png + label: "ATAC fragment coverage (per gene)" doc: | - Gene markers used for gene - expression heatmap + ATAC fragment coverage. + All genes of interest. + PNG format. + "sd:visualPlugins": + - image: + tab: "Genome coverage" + Caption: "ATAC fragment coverage (per gene)" gene_markers_tsv: type: File? outputSource: sc_wnn_cluster/gene_markers_tsv - label: "Gene markers per cluster for all resolutions" + label: "Gene markers" doc: | - Gene markers per cluster for - all resolutions - 'sd:visualPlugins': + Gene markers. + TSV format. 
+ "sd:visualPlugins": - syncfusiongrid: - tab: 'Gene markers' - Title: 'Gene markers per cluster for all resolutions' + tab: "Gene markers" + Title: "Gene markers" peak_markers_tsv: type: File? outputSource: sc_wnn_cluster/peak_markers_tsv - label: "Peak markers per cluster for all resolutions" + label: "Peak markers" doc: | - Peak markers per cluster for - all resolutions - 'sd:visualPlugins': + Peak markers. + TSV format. + "sd:visualPlugins": - syncfusiongrid: - tab: 'Peak markers' - Title: 'Peak markers per cluster for all resolutions' + tab: "Peak markers" + Title: "Peak markers" ucsc_cb_html_data: type: Directory? outputSource: sc_wnn_cluster/ucsc_cb_html_data - label: "UCSC Cell Browser data" + label: "UCSC Cell Browser (data)" doc: | - Directory with UCSC Cell Browser - data + UCSC Cell Browser html data. ucsc_cb_html_file: type: File? outputSource: sc_wnn_cluster/ucsc_cb_html_file label: "UCSC Cell Browser" doc: | - UCSC Cell Browser HTML index file + UCSC Cell Browser html index. "sd:visualPlugins": - linkList: tab: "Overview" @@ -481,40 +553,40 @@ outputs: seurat_data_rds: type: File outputSource: sc_wnn_cluster/seurat_data_rds - label: "Processed Seurat data in RDS format" + label: "Seurat object in RDS format" doc: | - Processed Seurat data in RDS format + Seurat object. + RDS format. seurat_data_scope: type: File? outputSource: sc_wnn_cluster/seurat_data_scope - label: "Processed Seurat data in SCope compatible loom format" + label: "Seurat object in SCope compatible loom format" doc: | - Processed Seurat data in SCope compatible loom format. - Only not normalized raw counts from the RNA assay will - be saved + Seurat object. + SCope compatible. + Loom format. pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder - label: "Plots in PDF format" + label: "Compressed folder with all PDF plots" doc: | - Compressed folder with plots - in PDF format + Compressed folder with all PDF plots. 
sc_wnn_cluster_stdout_log: type: File outputSource: sc_wnn_cluster/stdout_log - label: "stdout log generated by sc_wnn_cluster step" + label: "Output log" doc: | - stdout log generated by sc_wnn_cluster step + Stdout log from the sc_wnn_cluster step. sc_wnn_cluster_stderr_log: type: File outputSource: sc_wnn_cluster/stderr_log - label: "stderr log generated by sc_wnn_cluster step" + label: "Error log" doc: | - stderr log generated by sc_wnn_cluster step + Stderr log from the sc_wnn_cluster step. steps: @@ -568,37 +640,43 @@ steps: source: threads valueFrom: $(parseInt(self)) out: - - umap_res_plot_png - - umap_spl_idnt_res_plot_png + - umap_gr_ph_spl_idnt_plot_png + - cmp_gr_ph_spl_idnt_plot_png + - umap_gr_ph_spl_cnd_plot_png + - cmp_gr_ph_spl_cnd_plot_png + - umap_gr_clst_res_plot_png + - umap_gr_clst_spl_idnt_res_plot_png - cmp_gr_clst_spl_idnt_res_plot_png - cmp_gr_idnt_spl_clst_res_plot_png - - umap_spl_cnd_res_plot_png + - umap_gr_clst_spl_ph_res_plot_png + - cmp_gr_ph_spl_clst_res_plot_png + - umap_gr_clst_spl_cnd_res_plot_png - cmp_gr_clst_spl_cnd_res_plot_png - cmp_gr_cnd_spl_clst_res_plot_png - - umap_spl_ph_res_plot_png - - cmp_gr_ph_spl_idnt_plot_png - - cmp_gr_ph_spl_clst_res_plot_png - - xpr_avg_res_plot_png - xpr_per_cell_plot_png + - xpr_avg_res_plot_png - xpr_dnst_res_plot_png - - cvrg_res_plot_png - xpr_htmp_res_plot_png - - umap_res_plot_pdf - - umap_spl_idnt_res_plot_pdf + - cvrg_res_plot_png + - umap_gr_ph_spl_idnt_plot_pdf + - cmp_gr_ph_spl_idnt_plot_pdf + - umap_gr_ph_spl_cnd_plot_pdf + - cmp_gr_ph_spl_cnd_plot_pdf + - umap_gr_clst_res_plot_pdf + - umap_gr_clst_spl_idnt_res_plot_pdf - cmp_gr_clst_spl_idnt_res_plot_pdf - cmp_gr_idnt_spl_clst_res_plot_pdf - - umap_spl_cnd_res_plot_pdf + - umap_gr_clst_spl_ph_res_plot_pdf + - cmp_gr_ph_spl_clst_res_plot_pdf + - umap_gr_clst_spl_cnd_res_plot_pdf - cmp_gr_clst_spl_cnd_res_plot_pdf - cmp_gr_cnd_spl_clst_res_plot_pdf - - umap_spl_ph_res_plot_pdf - - cmp_gr_ph_spl_idnt_plot_pdf - - 
cmp_gr_ph_spl_clst_res_plot_pdf - - xpr_avg_res_plot_pdf - xpr_per_cell_plot_pdf - xpr_per_cell_sgnl_plot_pdf + - xpr_avg_res_plot_pdf - xpr_dnst_res_plot_pdf - - cvrg_res_plot_pdf - xpr_htmp_res_plot_pdf + - cvrg_res_plot_pdf - xpr_htmp_res_tsv - gene_markers_tsv - peak_markers_tsv @@ -614,22 +692,25 @@ steps: in: input_files: source: - - sc_wnn_cluster/umap_res_plot_pdf - - sc_wnn_cluster/umap_spl_idnt_res_plot_pdf + - sc_wnn_cluster/umap_gr_ph_spl_idnt_plot_pdf + - sc_wnn_cluster/cmp_gr_ph_spl_idnt_plot_pdf + - sc_wnn_cluster/umap_gr_ph_spl_cnd_plot_pdf + - sc_wnn_cluster/cmp_gr_ph_spl_cnd_plot_pdf + - sc_wnn_cluster/umap_gr_clst_res_plot_pdf + - sc_wnn_cluster/umap_gr_clst_spl_idnt_res_plot_pdf - sc_wnn_cluster/cmp_gr_clst_spl_idnt_res_plot_pdf - sc_wnn_cluster/cmp_gr_idnt_spl_clst_res_plot_pdf - - sc_wnn_cluster/umap_spl_cnd_res_plot_pdf + - sc_wnn_cluster/umap_gr_clst_spl_ph_res_plot_pdf + - sc_wnn_cluster/cmp_gr_ph_spl_clst_res_plot_pdf + - sc_wnn_cluster/umap_gr_clst_spl_cnd_res_plot_pdf - sc_wnn_cluster/cmp_gr_clst_spl_cnd_res_plot_pdf - sc_wnn_cluster/cmp_gr_cnd_spl_clst_res_plot_pdf - - sc_wnn_cluster/umap_spl_ph_res_plot_pdf - - sc_wnn_cluster/cmp_gr_ph_spl_idnt_plot_pdf - - sc_wnn_cluster/cmp_gr_ph_spl_clst_res_plot_pdf - - sc_wnn_cluster/xpr_avg_res_plot_pdf - sc_wnn_cluster/xpr_per_cell_plot_pdf - sc_wnn_cluster/xpr_per_cell_sgnl_plot_pdf + - sc_wnn_cluster/xpr_avg_res_plot_pdf - sc_wnn_cluster/xpr_dnst_res_plot_pdf - - sc_wnn_cluster/cvrg_res_plot_pdf - sc_wnn_cluster/xpr_htmp_res_plot_pdf + - sc_wnn_cluster/cvrg_res_plot_pdf valueFrom: $(self.flat().filter(n => n)) folder_basename: default: "pdf_plots" @@ -652,7 +733,7 @@ $schemas: label: "Single-Cell WNN Cluster Analysis" s:name: "Single-Cell WNN Cluster Analysis" -s:alternateName: "Clusters cells by similarity based on both gene expression and chromatin accessibility data" +s:alternateName: "Clusters multiome ATAC and RNA-Seq datasets, identifies gene markers and differentially accessible peaks" 
s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-wnn-cluster.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium diff --git a/workflows/single-cell-preprocess-cellranger.cwl b/workflows/single-cell-preprocess-cellranger.cwl index de0af624..488eb1ef 100644 --- a/workflows/single-cell-preprocess-cellranger.cwl +++ b/workflows/single-cell-preprocess-cellranger.cwl @@ -92,13 +92,13 @@ inputs: - "4" - "5" - "6" - default: "4" + default: "6" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 4 + Default: 6 "sd:layout": advanced: true From 50dfed9cdf8d6764c6b7b3a27e5b73ca2be265e7 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 12 Mar 2024 01:25:41 -0400 Subject: [PATCH 123/162] Update glob field in the sc tools gene expression plots --- tools/sc-ctype-assign.cwl | 4 ++-- tools/sc-rna-cluster.cwl | 4 ++-- tools/sc-wnn-cluster.cwl | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index 0e193174..f1778d6c 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -635,7 +635,7 @@ outputs: - type: array items: File outputBinding: - glob: "*_xpr_per_cell_*.png" + glob: "*_xpr_per_cell_[!sgnl_]*.png" doc: | UMAP colored by gene expression. All genes of interest. @@ -647,7 +647,7 @@ outputs: - type: array items: File outputBinding: - glob: "*_xpr_per_cell_*.pdf" + glob: "*_xpr_per_cell_[!sgnl_]*.pdf" doc: | UMAP colored by gene expression. All genes of interest. diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index e7b96262..9186f47b 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -618,7 +618,7 @@ outputs: - type: array items: File outputBinding: - glob: "*_xpr_per_cell_*.png" + glob: "*_xpr_per_cell_[!sgnl_]*.png" doc: | UMAP colored by gene expression. All genes of interest. 
@@ -630,7 +630,7 @@ outputs: - type: array items: File outputBinding: - glob: "*_xpr_per_cell_*.pdf" + glob: "*_xpr_per_cell_[!sgnl_]*.pdf" doc: | UMAP colored by gene expression. All genes of interest. diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index c63f975b..112c61b6 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -751,7 +751,7 @@ outputs: - type: array items: File outputBinding: - glob: "*_xpr_per_cell_*.png" + glob: "*_xpr_per_cell_[!sgnl_]*.png" doc: | UMAP colored by gene expression. All genes of interest. @@ -763,7 +763,7 @@ outputs: - type: array items: File outputBinding: - glob: "*_xpr_per_cell_*.pdf" + glob: "*_xpr_per_cell_[!sgnl_]*.pdf" doc: | UMAP colored by gene expression. All genes of interest. From fdccee1751dc9e1fe9da87017815be95a8b47715 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 18 Mar 2024 14:05:53 -0400 Subject: [PATCH 124/162] Make clustering pipelines outputs pretty --- workflows/sc-atac-cluster.cwl | 67 ++----- workflows/sc-ctype-assign.cwl | 198 ++++++++----------- workflows/sc-rna-cluster.cwl | 301 +++++++++++++---------------- workflows/sc-wnn-cluster.cwl | 353 +++++++++++++++------------------- 4 files changed, 393 insertions(+), 526 deletions(-) diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index 8be1143c..2ba9fc44 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -187,7 +187,7 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per cluster" + tab: "Split by cluster" Caption: "UMAP colored by cluster (all cells)" slh_gr_clst_res_plot_png: @@ -203,7 +203,7 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per cluster" + tab: "Split by cluster" Caption: "Silhouette scores (all cells)" umap_gr_clst_spl_idnt_res_plot_png: @@ -220,7 +220,7 @@ outputs: PNG format. 
"sd:visualPlugins": - image: - tab: "Per dataset" + tab: "Split by dataset" Caption: "UMAP colored by cluster (split by dataset, downsampled)" cmp_gr_clst_spl_idnt_res_plot_png: @@ -237,26 +237,9 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per dataset" + tab: "Split by dataset" Caption: "Composition plot colored by cluster (split by dataset, downsampled)" - cmp_gr_idnt_spl_clst_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_atac_cluster/cmp_gr_idnt_spl_clst_res_plot_png - label: "Composition plot colored by dataset (split by cluster, downsampled)" - doc: | - Composition plot colored by dataset. - Split by cluster; downsampled to the - smallest dataset. - PNG format. - "sd:visualPlugins": - - image: - tab: "Per dataset" - Caption: "Composition plot colored by dataset (split by cluster, downsampled)" - umap_gr_clst_spl_cnd_res_plot_png: type: - "null" @@ -272,7 +255,7 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per group" + tab: "Split by group" Caption: "UMAP colored by cluster (split by grouping condition, downsampled)" cmp_gr_clst_spl_cnd_res_plot_png: @@ -290,27 +273,9 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per group" + tab: "Split by group" Caption: "Composition plot colored by cluster (split by grouping condition, downsampled)" - cmp_gr_cnd_spl_clst_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_atac_cluster/cmp_gr_cnd_spl_clst_res_plot_png - label: "Composition plot colored by grouping condition (split by cluster, downsampled)" - doc: | - Composition plot colored by grouping condition. - Split by cluster; first downsampled to the - smallest dataset, then downsampled to the - smallest group. - PNG format. - "sd:visualPlugins": - - image: - tab: "Per group" - Caption: "Composition plot colored by grouping condition (split by cluster, downsampled)" - cvrg_res_plot_png: type: - "null" @@ -324,7 +289,7 @@ outputs: PNG format. 
"sd:visualPlugins": - image: - tab: "Genome coverage" + tab: "Genes of interest (coverage plot)" Caption: "ATAC fragment coverage (per gene)" peak_markers_tsv: @@ -336,7 +301,7 @@ outputs: TSV format. "sd:visualPlugins": - syncfusiongrid: - tab: "Peak markers" + tab: "Peak markers table" Title: "Peak markers" ucsc_cb_html_data: @@ -432,10 +397,8 @@ steps: - slh_gr_clst_res_plot_png - umap_gr_clst_spl_idnt_res_plot_png - cmp_gr_clst_spl_idnt_res_plot_png - - cmp_gr_idnt_spl_clst_res_plot_png - umap_gr_clst_spl_cnd_res_plot_png - cmp_gr_clst_spl_cnd_res_plot_png - - cmp_gr_cnd_spl_clst_res_plot_png - cvrg_res_plot_png - umap_gr_clst_res_plot_pdf - slh_gr_clst_res_plot_pdf @@ -489,7 +452,7 @@ $schemas: label: "Single-Cell ATAC-Seq Cluster Analysis" s:name: "Single-Cell ATAC-Seq Cluster Analysis" -s:alternateName: "Clusters cells by similarity of chromatin accessibility data" +s:alternateName: "Single-Cell ATAC-Seq Cluster Analysis" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-atac-cluster.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -529,8 +492,10 @@ s:creator: doc: | Single-Cell ATAC-Seq Cluster Analysis - Clusters cells by similarity of chromatin accessibility data - from the outputs of “Single-Cell ATAC-Seq Dimensionality - Reduction Analysis” pipeline. The results of this workflow are - primarily used in “Single-Cell Manual Cell Type Assignment” - pipeline. \ No newline at end of file + Clusters cells by similarity of chromatin accessibility + data from the outputs of the “Single-Cell ATAC-Seq + Dimensionality Reduction Analysis” pipeline. The results + of this workflow are used in the “Single-Cell Manual Cell + Type Assignment”, “Single-Cell ATAC-Seq Differential + Accessibility Analysis”, and “Single-Cell ATAC-Seq Genome + Coverage” pipelines. 
\ No newline at end of file diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 1ceeff57..dc87ed2b 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -232,9 +232,39 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per cell type" + tab: "Split by cell type" Caption: "UMAP colored by cell type (all cells)" + umap_gr_ctyp_spl_ph_png: + type: File? + outputSource: ctype_assign/umap_gr_ctyp_spl_ph_png + label: "UMAP colored by cell type (split by cell cycle phase, optionally downsampled)" + doc: | + UMAP colored by cell type. + Split by cell cycle phase; downsampled + to the smallest dataset (if multiple + datasets are analyzed jointly). + PNG format. + "sd:visualPlugins": + - image: + tab: "Split by cell type" + Caption: "UMAP colored by cell type (split by cell cycle phase, optionally downsampled)" + + cmp_gr_ph_spl_ctyp_png: + type: File? + outputSource: ctype_assign/cmp_gr_ph_spl_ctyp_png + label: "Composition plot colored by cell cycle phase (split by cell type, optionally downsampled)" + doc: | + Composition plot colored by cell cycle phase. + Split by cell type; downsampled to the + smallest dataset (if multiple datasets are + analyzed jointly). + PNG format. + "sd:visualPlugins": + - image: + tab: "Split by cell type" + Caption: "Composition plot colored by cell cycle phase (split by cell type, optionally downsampled)" + umap_gr_ctyp_spl_idnt_plot_png: type: File? outputSource: ctype_assign/umap_gr_ctyp_spl_idnt_plot_png @@ -246,7 +276,7 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per dataset" + tab: "Split by dataset" Caption: "UMAP colored by cell type (split by dataset, downsampled)" cmp_gr_ctyp_spl_idnt_plot_png: @@ -260,23 +290,9 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per dataset" + tab: "Split by dataset" Caption: "Composition plot colored by cell type (split by dataset, downsampled)" - cmp_gr_idnt_spl_ctyp_plot_png: - type: File? 
- outputSource: ctype_assign/cmp_gr_idnt_spl_ctyp_plot_png - label: "Composition plot colored by dataset (split by cell type, downsampled)" - doc: | - Composition plot colored by dataset. - Split by cell type; downsampled to - the smallest dataset. - PNG format. - "sd:visualPlugins": - - image: - tab: "Per dataset" - Caption: "Composition plot colored by dataset (split by cell type, downsampled)" - umap_gr_ph_spl_idnt_plot_png: type: File? outputSource: ctype_assign/umap_gr_ph_spl_idnt_plot_png @@ -288,7 +304,7 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per dataset" + tab: "Split by dataset" Caption: "UMAP colored by cell cycle phase (split by dataset, downsampled)" cmp_gr_ph_spl_idnt_plot_png: @@ -302,39 +318,9 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per dataset" + tab: "Split by dataset" Caption: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" - umap_gr_ctyp_spl_ph_png: - type: File? - outputSource: ctype_assign/umap_gr_ctyp_spl_ph_png - label: "UMAP colored by cell type (split by cell cycle phase, optionally downsampled)" - doc: | - UMAP colored by cell type. - Split by cell cycle phase; downsampled - to the smallest dataset (if multiple - datasets are analyzed jointly). - PNG format. - "sd:visualPlugins": - - image: - tab: "Per cell type" - Caption: "UMAP colored by cell type (split by cell cycle phase, optionally downsampled)" - - cmp_gr_ph_spl_ctyp_png: - type: File? - outputSource: ctype_assign/cmp_gr_ph_spl_ctyp_png - label: "Composition plot colored by cell cycle phase (split by cell type, optionally downsampled)" - doc: | - Composition plot colored by cell cycle phase. - Split by cell type; downsampled to the - smallest dataset (if multiple datasets are - analyzed jointly). - PNG format. 
- "sd:visualPlugins": - - image: - tab: "Per cell type" - Caption: "Composition plot colored by cell cycle phase (split by cell type, optionally downsampled)" - umap_gr_ctyp_spl_cnd_plot_png: type: File? outputSource: ctype_assign/umap_gr_ctyp_spl_cnd_plot_png @@ -347,7 +333,7 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per group" + tab: "Split by group" Caption: "UMAP colored by cell type (split by grouping condition, downsampled)" cmp_gr_ctyp_spl_cnd_plot_png: @@ -362,24 +348,9 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per group" + tab: "Split by group" Caption: "Composition plot colored by cell type (split by grouping condition, downsampled)" - cmp_gr_cnd_spl_ctyp_plot_png: - type: File? - outputSource: ctype_assign/cmp_gr_cnd_spl_ctyp_plot_png - label: "Composition plot colored by grouping condition (split by cell type, downsampled)" - doc: | - Composition plot colored by grouping condition. - Split by cell type; first downsampled to the - smallest dataset, then downsampled to the - smallest group. - PNG format. - "sd:visualPlugins": - - image: - tab: "Per group" - Caption: "Composition plot colored by grouping condition (split by cell type, downsampled)" - umap_gr_ph_spl_cnd_plot_png: type: File? outputSource: ctype_assign/umap_gr_ph_spl_cnd_plot_png @@ -392,7 +363,7 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per group" + tab: "Split by group" Caption: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" cmp_gr_ph_spl_cnd_plot_png: @@ -407,21 +378,9 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per group" + tab: "Split by group" Caption: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" - xpr_avg_plot_png: - type: File? - outputSource: ctype_assign/xpr_avg_plot_png - label: "Average gene expression" - doc: | - Average gene expression. - PNG format. 
- "sd:visualPlugins": - - image: - tab: "Gene expression" - Caption: "Average gene expression" - xpr_per_cell_plot_png: type: - "null" @@ -435,7 +394,7 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Gene expression" + tab: "Genes of interest (UMAP)" Caption: "UMAP colored by gene expression (per gene)" xpr_dnst_plot_png: @@ -451,30 +410,20 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Gene expression" + tab: "Genes of interest (violin plot)" Caption: "Gene expression density (per gene)" - xpr_htmp_plot_png: + xpr_avg_plot_png: type: File? - outputSource: ctype_assign/xpr_htmp_plot_png - label: "Gene expression heatmap (top gene markers)" + outputSource: ctype_assign/xpr_avg_plot_png + label: "Average gene expression" doc: | - Gene expression heatmap. - Top gene markers. + Average gene expression. PNG format. "sd:visualPlugins": - image: - tab: "Gene expression heatmap" - Caption: "Gene expression heatmap (top gene markers)" - - xpr_htmp_tsv: - type: File? - outputSource: ctype_assign/xpr_htmp_tsv - label: "Gene expression heatmap (top gene markers)" - doc: | - Gene expression heatmap. - Top gene markers. - TSV format. + tab: "Genes of interest (dot plot)" + Caption: "Average gene expression" cvrg_plot_png: type: @@ -489,9 +438,31 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Genome coverage" + tab: "Genes of interest (coverage plot)" Caption: "ATAC fragment coverage (per gene)" + xpr_htmp_plot_png: + type: File? + outputSource: ctype_assign/xpr_htmp_plot_png + label: "Gene expression heatmap (top gene markers)" + doc: | + Gene expression heatmap. + Top gene markers. + PNG format. + "sd:visualPlugins": + - image: + tab: "Gene markers heatmap" + Caption: "Gene expression heatmap (top gene markers)" + + xpr_htmp_tsv: + type: File? + outputSource: ctype_assign/xpr_htmp_tsv + label: "Gene expression heatmap (top gene markers)" + doc: | + Gene expression heatmap. + Top gene markers. + TSV format. + gene_markers_tsv: type: File? 
outputSource: ctype_assign/gene_markers_tsv @@ -501,7 +472,7 @@ outputs: TSV format. "sd:visualPlugins": - syncfusiongrid: - tab: "Gene markers" + tab: "Gene markers table" Title: "Gene markers" peak_markers_tsv: @@ -513,7 +484,7 @@ outputs: TSV format. "sd:visualPlugins": - syncfusiongrid: - tab: "Peak markers" + tab: "Peak markers table" Title: "Peak markers" ucsc_cb_html_data: @@ -650,14 +621,12 @@ steps: - umap_gr_ctyp_plot_png - umap_gr_ctyp_spl_idnt_plot_png - cmp_gr_ctyp_spl_idnt_plot_png - - cmp_gr_idnt_spl_ctyp_plot_png - umap_gr_ph_spl_idnt_plot_png - cmp_gr_ph_spl_idnt_plot_png - umap_gr_ctyp_spl_ph_png - cmp_gr_ph_spl_ctyp_png - umap_gr_ctyp_spl_cnd_plot_png - cmp_gr_ctyp_spl_cnd_plot_png - - cmp_gr_cnd_spl_ctyp_plot_png - umap_gr_ph_spl_cnd_plot_png - cmp_gr_ph_spl_cnd_plot_png - xpr_avg_plot_png @@ -740,7 +709,7 @@ $schemas: label: "Single-Cell Manual Cell Type Assignment" s:name: "Single-Cell Manual Cell Type Assignment" -s:alternateName: "Assigns identities to clustered cells" +s:alternateName: "Single-Cell Manual Cell Type Assignment" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-ctype-assign.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -781,14 +750,15 @@ doc: | Single-Cell Manual Cell Type Assignment Assigns identities to cells clustered with any of the “Single-Cell - Cluster Analysis” pipelines. For “Single-Cell RNA-Seq Cluster Analysis” - the results of this workflow are primarily used in “Single-Cell + Cluster Analysis” pipelines. For “Single-Cell RNA-Seq Cluster + Analysis” the results of this workflow are used in the “Single-Cell RNA-Seq Differential Expression Analysis”, “Single-Cell RNA-Seq - Trajectory Analysis”, and, when combined with outputs from “Cell Ranger - Count (RNA+VDJ)” or “Cell Ranger Aggregate (RNA, RNA+VDJ)” workflows – - in “Single-Cell Immune Profiling Analysis” pipelines. 
For “Single-Cell - ATAC-Seq Cluster Analysis” the results of this workflow are primarily - used in “Single-Cell ATAC-Seq Differential Binding Analysis” and - “Single-Cell ATAC-Seq Genome Coverage” pipelines. For “Single-Cell WNN - Cluster Analysis” – in all of the above, except the “Single-Cell - Immune Profiling Analysis” workflow. \ No newline at end of file + Trajectory Analysis”, and — when combined with outputs from the + “Cell Ranger Count (RNA+VDJ)” or “Cell Ranger Aggregate (RNA, RNA+VDJ)” + workflow — in the “Single-Cell Immune Profiling Analysis” pipeline. + For “Single-Cell ATAC-Seq Cluster Analysis”, the results of this + workflow are used in the “Single-Cell ATAC-Seq Differential + Accessibility Analysis” and “Single-Cell ATAC-Seq Genome Coverage” + pipelines. For “Single-Cell WNN Cluster Analysis”, the results of + this workflow are used in all of the above, except the “Single-Cell + Immune Profiling Analysis” pipeline. \ No newline at end of file diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index c80259d6..6dbf9543 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -55,12 +55,13 @@ inputs: default: 40 label: "Target dimensionality" doc: | - Number of principal components to be - used in constructing nearest-neighbor - graph as part of the clustering - algorithm. Accepted values range from - 1 to 50. Set to 0 to use auto-estimated - dimensionality. + Target dimensionality is the number + of principal components to be used in + constructing nearest-neighbor graph as + part of the clustering algorithm. The + accepted values range from 0 to 50. + 0 will be replaced with the auto- + estimated target dimensionality. Default: 40 resolution: @@ -68,13 +69,13 @@ inputs: default: 0.3 label: "Clustering resolution" doc: | - Resolution to define the "granularity" - of the clustered data. Larger values - lead to a bigger number of clusters. - Optimal resolution often increases - with the number of cells. 
For a dataset - of 3K cells, the value within 0.4-1.2 - range usually returns good results. + The resolution defines the “granularity” + of the clustered data. Larger resolution + values lead to more clusters. The optimal + resolution often increases with the number + of cells. For a dataset of 3,000 cells, a + value within the 0.3-1.2 range usually + returns good results. Default: 0.3 identify_diff_genes: @@ -82,18 +83,17 @@ inputs: default: true label: "Find gene markers" doc: | - Identify upregulated genes in each - cluster compared to all other cells. - Include only genes that are expressed - in at least 10% of the cells coming - from either current cluster or from - all other clusters together. - Exclude cells with log2FoldChange - values less than 0.25. Use Wilcoxon - Rank Sum test to calculate P-values. - Keep only genes with P-values lower - than 0.01. Adjust P-values for multiple - comparisons using Bonferroni correction. + The user can identify upregulated genes + in each cluster compared to all other + cells. The results include only genes + that are expressed in at least 10% of + the cells coming from either the current + cluster or from all other clusters together. + Genes with the log2FoldChange values smaller + than 0.25 are excluded. The p-values are + calculated with the Wilcoxon Rank Sum test + and adjusted for multiple comparisons using + the Bonferroni correction. Default: true genes_of_interest: @@ -101,9 +101,8 @@ inputs: default: null label: "Genes of interest" doc: | - Comma or space separated list of - genes of interest to visualize - expression. + A comma- or space-separated list of genes + of interest to visualize expression. Default: None color_theme: @@ -152,64 +151,6 @@ inputs: outputs: - umap_gr_ph_spl_idnt_plot_png: - type: File? - outputSource: sc_rna_cluster/umap_gr_ph_spl_idnt_plot_png - label: "UMAP colored by cell cycle phase (split by dataset, downsampled)" - doc: | - UMAP colored by cell cycle phase. 
- Split by dataset; downsampled to the - smallest dataset. - PNG format. - "sd:visualPlugins": - - image: - tab: "Per dataset" - Caption: "UMAP colored by cell cycle phase (split by dataset, downsampled)" - - cmp_gr_ph_spl_idnt_plot_png: - type: File? - outputSource: sc_rna_cluster/cmp_gr_ph_spl_idnt_plot_png - label: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" - doc: | - Composition plot colored by cell cycle phase. - Split by dataset; downsampled to the smallest - dataset. - PNG format - "sd:visualPlugins": - - image: - tab: "Per dataset" - Caption: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" - - umap_gr_ph_spl_cnd_plot_png: - type: File? - outputSource: sc_rna_cluster/umap_gr_ph_spl_cnd_plot_png - label: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" - doc: | - UMAP colored by cell cycle phase. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group. - PNG format. - "sd:visualPlugins": - - image: - tab: "Per group" - Caption: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" - - cmp_gr_ph_spl_cnd_plot_png: - type: File? - outputSource: sc_rna_cluster/cmp_gr_ph_spl_cnd_plot_png - label: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" - doc: | - Composition plot colored by cell cycle phase. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group. - PNG format. - "sd:visualPlugins": - - image: - tab: "Per group" - Caption: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" - umap_gr_clst_res_plot_png: type: - "null" @@ -223,7 +164,7 @@ outputs: PNG format. 
"sd:visualPlugins": - image: - tab: "Per cluster" + tab: "Split by cluster" Caption: "UMAP colored by cluster (all cells)" slh_gr_clst_res_plot_png: @@ -239,9 +180,45 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per cluster" + tab: "Split by cluster" Caption: "Silhouette scores (all cells)" + umap_gr_clst_spl_ph_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/umap_gr_clst_spl_ph_res_plot_png + label: "UMAP colored by cluster (split by cell cycle phase, optionally downsampled)" + doc: | + UMAP colored by cluster. + Split by cell cycle phase; downsampled + to the smallest dataset (if multiple + datasets are analyzed jointly). + PNG format. + "sd:visualPlugins": + - image: + tab: "Split by cluster" + Caption: "UMAP colored by cluster (split by cell cycle phase, optionally downsampled)" + + cmp_gr_ph_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/cmp_gr_ph_spl_clst_res_plot_png + label: "Composition plot colored by cell cycle phase (split by cluster, optionally downsampled)" + doc: | + Composition plot colored by cell cycle phase. + Split by cluster; downsampled to the smallest + dataset (if multiple datasets are analyzed + jointly). + PNG format + "sd:visualPlugins": + - image: + tab: "Split by cluster" + Caption: "Composition plot colored by cell cycle phase (split by cluster, optionally downsampled)" + umap_gr_clst_spl_idnt_res_plot_png: type: - "null" @@ -256,7 +233,7 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per dataset" + tab: "Split by dataset" Caption: "UMAP colored by cluster (split by dataset, downsampled)" cmp_gr_clst_spl_idnt_res_plot_png: @@ -273,61 +250,36 @@ outputs: PNG format. 
"sd:visualPlugins": - image: - tab: "Per dataset" + tab: "Split by dataset" Caption: "Composition plot colored by cluster (split by dataset, downsampled)" - cmp_gr_idnt_spl_clst_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_rna_cluster/cmp_gr_idnt_spl_clst_res_plot_png - label: "Composition plot colored by dataset (split by cluster, downsampled)" + umap_gr_ph_spl_idnt_plot_png: + type: File? + outputSource: sc_rna_cluster/umap_gr_ph_spl_idnt_plot_png + label: "UMAP colored by cell cycle phase (split by dataset, downsampled)" doc: | - Composition plot colored by dataset. - Split by cluster; downsampled to the + UMAP colored by cell cycle phase. + Split by dataset; downsampled to the smallest dataset. PNG format. "sd:visualPlugins": - image: - tab: "Per dataset" - Caption: "Composition plot colored by dataset (split by cluster, downsampled)" - - umap_gr_clst_spl_ph_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_rna_cluster/umap_gr_clst_spl_ph_res_plot_png - label: "UMAP colored by cluster (split by cell cycle phase, optionally downsampled)" - doc: | - UMAP colored by cluster. - Split by cell cycle phase; downsampled - to the smallest dataset (if multiple - datasets are analyzed jointly). - PNG format. - "sd:visualPlugins": - - image: - tab: "Per cluster" - Caption: "UMAP colored by cluster (split by cell cycle phase, optionally downsampled)" + tab: "Split by dataset" + Caption: "UMAP colored by cell cycle phase (split by dataset, downsampled)" - cmp_gr_ph_spl_clst_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_rna_cluster/cmp_gr_ph_spl_clst_res_plot_png - label: "Composition plot colored by cell cycle phase (split by cluster, optionally downsampled)" + cmp_gr_ph_spl_idnt_plot_png: + type: File? 
+ outputSource: sc_rna_cluster/cmp_gr_ph_spl_idnt_plot_png + label: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" doc: | Composition plot colored by cell cycle phase. - Split by cluster; downsampled to the smallest - dataset (if multiple datasets are analyzed - jointly). + Split by dataset; downsampled to the smallest + dataset. PNG format "sd:visualPlugins": - image: - tab: "Per cluster" - Caption: "Composition plot colored by cell cycle phase (split by cluster, optionally downsampled)" + tab: "Split by dataset" + Caption: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" umap_gr_clst_spl_cnd_res_plot_png: type: @@ -344,7 +296,7 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per group" + tab: "Split by group" Caption: "UMAP colored by cluster (split by grouping condition, downsampled)" cmp_gr_clst_spl_cnd_res_plot_png: @@ -362,26 +314,38 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per group" + tab: "Split by group" Caption: "Composition plot colored by cluster (split by grouping condition, downsampled)" - cmp_gr_cnd_spl_clst_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_rna_cluster/cmp_gr_cnd_spl_clst_res_plot_png - label: "Composition plot colored by grouping condition (split by cluster, downsampled)" + umap_gr_ph_spl_cnd_plot_png: + type: File? + outputSource: sc_rna_cluster/umap_gr_ph_spl_cnd_plot_png + label: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" doc: | - Composition plot colored by grouping condition. - Split by cluster; first downsampled to the - smallest dataset, then downsampled to the - smallest group. + UMAP colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. PNG format. 
"sd:visualPlugins": - image: - tab: "Per group" - Caption: "Composition plot colored by grouping condition (split by cluster, downsampled)" + tab: "Split by group" + Caption: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" + + cmp_gr_ph_spl_cnd_plot_png: + type: File? + outputSource: sc_rna_cluster/cmp_gr_ph_spl_cnd_plot_png + label: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" + doc: | + Composition plot colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + "sd:visualPlugins": + - image: + tab: "Split by group" + Caption: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" xpr_per_cell_plot_png: type: @@ -396,39 +360,39 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Gene expression" + tab: "Genes of interest (UMAP)" Caption: "UMAP colored by gene expression (per gene)" - xpr_avg_res_plot_png: + xpr_dnst_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_rna_cluster/xpr_avg_res_plot_png - label: "Average gene expression" + outputSource: sc_rna_cluster/xpr_dnst_res_plot_png + label: "Gene expression density (per gene)" doc: | - Average gene expression. + Gene expression density. + All genes of interest. PNG format. "sd:visualPlugins": - image: - tab: "Gene expression" - Caption: "Average gene expression" + tab: "Genes of interest (violin plot)" + Caption: "Gene expression density (per gene)" - xpr_dnst_res_plot_png: + xpr_avg_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_rna_cluster/xpr_dnst_res_plot_png - label: "Gene expression density (per gene)" + outputSource: sc_rna_cluster/xpr_avg_res_plot_png + label: "Average gene expression" doc: | - Gene expression density. - All genes of interest. + Average gene expression. PNG format. 
"sd:visualPlugins": - image: - tab: "Gene expression" - Caption: "Gene expression density (per gene)" + tab: "Genes of interest (dot plot)" + Caption: "Average gene expression" xpr_htmp_res_plot_png: type: @@ -443,7 +407,7 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Gene expression heatmap" + tab: "Gene markers heatmap" Caption: "Gene expression heatmap (top gene markers)" xpr_htmp_res_tsv: @@ -467,7 +431,7 @@ outputs: TSV format. "sd:visualPlugins": - syncfusiongrid: - tab: "Gene markers" + tab: "Gene markers table" Title: "Gene markers" ucsc_cb_html_data: @@ -578,12 +542,10 @@ steps: - slh_gr_clst_res_plot_png - umap_gr_clst_spl_idnt_res_plot_png - cmp_gr_clst_spl_idnt_res_plot_png - - cmp_gr_idnt_spl_clst_res_plot_png - umap_gr_clst_spl_ph_res_plot_png - cmp_gr_ph_spl_clst_res_plot_png - umap_gr_clst_spl_cnd_res_plot_png - cmp_gr_clst_spl_cnd_res_plot_png - - cmp_gr_cnd_spl_clst_res_plot_png - xpr_per_cell_plot_png - xpr_avg_res_plot_png - xpr_dnst_res_plot_png @@ -662,7 +624,7 @@ $schemas: label: "Single-Cell RNA-Seq Cluster Analysis" s:name: "Single-Cell RNA-Seq Cluster Analysis" -s:alternateName: "Clusters cells by similarity of gene expression data" +s:alternateName: "Single-Cell RNA-Seq Cluster Analysis" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-cluster.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -703,6 +665,9 @@ doc: | Single-Cell RNA-Seq Cluster Analysis Clusters cells by similarity of gene expression data from - the outputs of “Single-Cell RNA-Seq Dimensionality Reduction - Analysis” pipeline. The results of this workflow are primarily - used in “Single-Cell Manual Cell Type Assignment” pipeline. \ No newline at end of file + the outputs of the “Single-Cell RNA-Seq Dimensionality + Reduction Analysis” pipeline. 
The results of this workflow + are used in the “Single-Cell Manual Cell Type Assignment”, + “Single-Cell RNA-Seq Differential Expression Analysis”, + “Single-Cell RNA-Seq Trajectory Analysis”, and “Single-Cell + Differential Abundance Analysis” pipelines. \ No newline at end of file diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index d18b23a2..a7eb543a 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -75,9 +75,10 @@ inputs: default: 40 label: "Target RNA dimensionality" doc: | - Number of principal components to be used - in constructing weighted nearest-neighbor - graph before clustering. Accepted values + Target RNA dimensionality is the number of + principal components to be used in + constructing the weighted nearest-neighbor + graph before clustering. The accepted values range from 1 to 50. Default: 40 @@ -86,11 +87,12 @@ inputs: default: 40 label: "Target ATAC dimensionality" doc: | - Number of LSI components to be used in - constructing weighted nearest-neighbor - graph before clustering. Accepted values - range from 2 to 50. First dimension is - always excluded + Target ATAC dimensionality is the number of + LSI dimensions to be used in constructing + the weighted nearest-neighbor graph before + clustering. The accepted values range from + 2 to 50. The first dimension is always + excluded. Default: 40 resolution: @@ -98,11 +100,11 @@ inputs: default: 0.3 label: "Clustering resolution" doc: | - Resolution to define the "granularity" - of the clustered data. Larger values - lead to a bigger number of clusters. - Optimal resolution often increases - with the number of cells. + The resolution defines the “granularity” + of the clustered data. Larger resolution + values lead to more clusters. The optimal + resolution often increases with the number + of cells. 
Default: 0.3 identify_diff_genes: @@ -110,18 +112,17 @@ inputs: default: true label: "Find gene markers" doc: | - Identify upregulated genes in each - cluster compared to all other cells. - Include only genes that are expressed - in at least 10% of the cells coming - from either current cluster or from - all other clusters together. - Exclude cells with log2FoldChange - values less than 0.25. Use Wilcoxon - Rank Sum test to calculate P-values. - Keep only genes with P-values lower - than 0.01. Adjust P-values for multiple - comparisons using Bonferroni correction. + The user can identify upregulated genes + in each cluster compared to all other + cells. The results include only genes + that are expressed in at least 10% of + the cells coming from either the current + cluster or from all other clusters together. + Genes with the log2FoldChange values smaller + than 0.25 are excluded. The p-values are + calculated with the Wilcoxon Rank Sum test + and adjusted for multiple comparisons using + the Bonferroni correction. Default: true identify_diff_peaks: @@ -129,19 +130,16 @@ inputs: default: false label: "Find peak markers" doc: | - Identify differentially accessible - peaks in each cluster compared to - all other cells. Include only peaks - that are present in at least 5% of - the cells coming from either current - cluster or from all other clusters - together. Exclude cells with - log2FoldChange values less than 0.25. - Use logistic regression framework to - calculate P-values. Keep only genes - with P-values lower than 0.01. Adjust - P-values for multiple comparisons - using Bonferroni correction. + The user can identify differentially accessible + peaks in each cluster compared to all other cells. + The results include only peaks that are present + in at least 5% of the cells coming from either + the current cluster or from all other clusters + together. Peaks with log2FoldChange values smaller + than 0.25 are excluded. 
The p-values are calculated + using the logistic regression framework and adjusted + for multiple comparisons using the Bonferroni + correction. Default: false genes_of_interest: @@ -149,11 +147,11 @@ inputs: default: null label: "Genes of interest" doc: | - Comma or space separated list of genes - of interest to visualize expression and - to generate ATAC fragments coverage plots. - Ignored if "Cell Ranger RNA+ATAC Sample - (optional)" input is not provided. + A comma- or space-separated list of genes + of interest to visualize expression. If the + “Cell Ranger RNA+ATAC Sample (optional)” + input was provided the ATAC fragment coverage + plots will be created as well. Default: None color_theme: @@ -202,79 +200,57 @@ inputs: outputs: - umap_gr_ph_spl_idnt_plot_png: - type: File? - outputSource: sc_wnn_cluster/umap_gr_ph_spl_idnt_plot_png - label: "UMAP colored by cell cycle phase (split by dataset, downsampled)" - doc: | - UMAP colored by cell cycle phase. - Split by dataset; downsampled to the - smallest dataset. - PNG format. - "sd:visualPlugins": - - image: - tab: "Per dataset" - Caption: "UMAP colored by cell cycle phase (split by dataset, downsampled)" - - cmp_gr_ph_spl_idnt_plot_png: - type: File? - outputSource: sc_wnn_cluster/cmp_gr_ph_spl_idnt_plot_png - label: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" - doc: | - Composition plot colored by cell cycle phase. - Split by dataset; downsampled to the smallest - dataset. - PNG format - "sd:visualPlugins": - - image: - tab: "Per dataset" - Caption: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" - - umap_gr_ph_spl_cnd_plot_png: - type: File? 
- outputSource: sc_wnn_cluster/umap_gr_ph_spl_cnd_plot_png - label: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" + umap_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/umap_gr_clst_res_plot_png + label: "UMAP colored by cluster (all cells)" doc: | - UMAP colored by cell cycle phase. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group. + UMAP colored by cluster. + All cells. PNG format. "sd:visualPlugins": - image: - tab: "Per group" - Caption: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" + tab: "Split by cluster" + Caption: "UMAP colored by cluster (all cells)" - cmp_gr_ph_spl_cnd_plot_png: - type: File? - outputSource: sc_wnn_cluster/cmp_gr_ph_spl_cnd_plot_png - label: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" + umap_gr_clst_spl_ph_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/umap_gr_clst_spl_ph_res_plot_png + label: "UMAP colored by cluster (split by cell cycle phase, optionally downsampled)" doc: | - Composition plot colored by cell cycle phase. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group. + UMAP colored by cluster. + Split by cell cycle phase; downsampled + to the smallest dataset (if multiple + datasets are analyzed jointly). PNG format. 
"sd:visualPlugins": - image: - tab: "Per group" - Caption: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" + tab: "Split by cluster" + Caption: "UMAP colored by cluster (split by cell cycle phase, optionally downsampled)" - umap_gr_clst_res_plot_png: + cmp_gr_ph_spl_clst_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_wnn_cluster/umap_gr_clst_res_plot_png - label: "UMAP colored by cluster (all cells)" + outputSource: sc_wnn_cluster/cmp_gr_ph_spl_clst_res_plot_png + label: "Composition plot colored by cell cycle phase (split by cluster, optionally downsampled)" doc: | - UMAP colored by cluster. - All cells. - PNG format. + Composition plot colored by cell cycle phase. + Split by cluster; downsampled to the smallest + dataset (if multiple datasets are analyzed + jointly). + PNG format "sd:visualPlugins": - image: - tab: "Per cluster" - Caption: "UMAP colored by cluster (all cells)" + tab: "Split by cluster" + Caption: "Composition plot colored by cell cycle phase (split by cluster, optionally downsampled)" umap_gr_clst_spl_idnt_res_plot_png: type: @@ -290,7 +266,7 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per dataset" + tab: "Split by dataset" Caption: "UMAP colored by cluster (split by dataset, downsampled)" cmp_gr_clst_spl_idnt_res_plot_png: @@ -307,61 +283,36 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per dataset" + tab: "Split by dataset" Caption: "Composition plot colored by cluster (split by dataset, downsampled)" - cmp_gr_idnt_spl_clst_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_wnn_cluster/cmp_gr_idnt_spl_clst_res_plot_png - label: "Composition plot colored by dataset (split by cluster, downsampled)" + umap_gr_ph_spl_idnt_plot_png: + type: File? 
+ outputSource: sc_wnn_cluster/umap_gr_ph_spl_idnt_plot_png + label: "UMAP colored by cell cycle phase (split by dataset, downsampled)" doc: | - Composition plot colored by dataset. - Split by cluster; downsampled to the + UMAP colored by cell cycle phase. + Split by dataset; downsampled to the smallest dataset. PNG format. "sd:visualPlugins": - image: - tab: "Per dataset" - Caption: "Composition plot colored by dataset (split by cluster, downsampled)" - - umap_gr_clst_spl_ph_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_wnn_cluster/umap_gr_clst_spl_ph_res_plot_png - label: "UMAP colored by cluster (split by cell cycle phase, optionally downsampled)" - doc: | - UMAP colored by cluster. - Split by cell cycle phase; downsampled - to the smallest dataset (if multiple - datasets are analyzed jointly). - PNG format. - "sd:visualPlugins": - - image: - tab: "Per cluster" - Caption: "UMAP colored by cluster (split by cell cycle phase, optionally downsampled)" + tab: "Split by dataset" + Caption: "UMAP colored by cell cycle phase (split by dataset, downsampled)" - cmp_gr_ph_spl_clst_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_wnn_cluster/cmp_gr_ph_spl_clst_res_plot_png - label: "Composition plot colored by cell cycle phase (split by cluster, optionally downsampled)" + cmp_gr_ph_spl_idnt_plot_png: + type: File? + outputSource: sc_wnn_cluster/cmp_gr_ph_spl_idnt_plot_png + label: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" doc: | Composition plot colored by cell cycle phase. - Split by cluster; downsampled to the smallest - dataset (if multiple datasets are analyzed - jointly). + Split by dataset; downsampled to the smallest + dataset. 
PNG format "sd:visualPlugins": - image: - tab: "Per cluster" - Caption: "Composition plot colored by cell cycle phase (split by cluster, optionally downsampled)" + tab: "Split by dataset" + Caption: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" umap_gr_clst_spl_cnd_res_plot_png: type: @@ -378,7 +329,7 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per group" + tab: "Split by group" Caption: "UMAP colored by cluster (split by grouping condition, downsampled)" cmp_gr_clst_spl_cnd_res_plot_png: @@ -396,26 +347,38 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Per group" + tab: "Split by group" Caption: "Composition plot colored by cluster (split by grouping condition, downsampled)" - cmp_gr_cnd_spl_clst_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_wnn_cluster/cmp_gr_cnd_spl_clst_res_plot_png - label: "Composition plot colored by grouping condition (split by cluster, downsampled)" + umap_gr_ph_spl_cnd_plot_png: + type: File? + outputSource: sc_wnn_cluster/umap_gr_ph_spl_cnd_plot_png + label: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" doc: | - Composition plot colored by grouping condition. - Split by cluster; first downsampled to the - smallest dataset, then downsampled to the - smallest group. + UMAP colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. PNG format. "sd:visualPlugins": - image: - tab: "Per group" - Caption: "Composition plot colored by grouping condition (split by cluster, downsampled)" + tab: "Split by group" + Caption: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" + + cmp_gr_ph_spl_cnd_plot_png: + type: File? 
+ outputSource: sc_wnn_cluster/cmp_gr_ph_spl_cnd_plot_png + label: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" + doc: | + Composition plot colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + "sd:visualPlugins": + - image: + tab: "Split by group" + Caption: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" xpr_per_cell_plot_png: type: @@ -430,9 +393,25 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Gene expression" + tab: "Genes of interest (UMAP)" Caption: "UMAP colored by gene expression (per gene)" + xpr_dnst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/xpr_dnst_res_plot_png + label: "Gene expression density (per gene)" + doc: | + Gene expression density. + All genes of interest. + PNG format. + "sd:visualPlugins": + - image: + tab: "Genes of interest (violin plot)" + Caption: "Gene expression density (per gene)" + xpr_avg_res_plot_png: type: - "null" @@ -445,24 +424,24 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Gene expression" + tab: "Genes of interest (dot plot)" Caption: "Average gene expression" - xpr_dnst_res_plot_png: + cvrg_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_wnn_cluster/xpr_dnst_res_plot_png - label: "Gene expression density (per gene)" + outputSource: sc_wnn_cluster/cvrg_res_plot_png + label: "ATAC fragment coverage (per gene)" doc: | - Gene expression density. + ATAC fragment coverage. All genes of interest. PNG format. "sd:visualPlugins": - image: - tab: "Gene expression" - Caption: "Gene expression density (per gene)" + tab: "Genes of interest (coverage plot)" + Caption: "ATAC fragment coverage (per gene)" xpr_htmp_res_plot_png: type: @@ -477,7 +456,7 @@ outputs: PNG format. 
"sd:visualPlugins": - image: - tab: "Gene expression heatmap" + tab: "Gene markers heatmap" Caption: "Gene expression heatmap (top gene markers)" xpr_htmp_res_tsv: @@ -492,22 +471,6 @@ outputs: Top gene markers. TSV format. - cvrg_res_plot_png: - type: - - "null" - - type: array - items: File - outputSource: sc_wnn_cluster/cvrg_res_plot_png - label: "ATAC fragment coverage (per gene)" - doc: | - ATAC fragment coverage. - All genes of interest. - PNG format. - "sd:visualPlugins": - - image: - tab: "Genome coverage" - Caption: "ATAC fragment coverage (per gene)" - gene_markers_tsv: type: File? outputSource: sc_wnn_cluster/gene_markers_tsv @@ -517,7 +480,7 @@ outputs: TSV format. "sd:visualPlugins": - syncfusiongrid: - tab: "Gene markers" + tab: "Gene markers table" Title: "Gene markers" peak_markers_tsv: @@ -529,7 +492,7 @@ outputs: TSV format. "sd:visualPlugins": - syncfusiongrid: - tab: "Peak markers" + tab: "Peak markers table" Title: "Peak markers" ucsc_cb_html_data: @@ -647,12 +610,10 @@ steps: - umap_gr_clst_res_plot_png - umap_gr_clst_spl_idnt_res_plot_png - cmp_gr_clst_spl_idnt_res_plot_png - - cmp_gr_idnt_spl_clst_res_plot_png - umap_gr_clst_spl_ph_res_plot_png - cmp_gr_ph_spl_clst_res_plot_png - umap_gr_clst_spl_cnd_res_plot_png - cmp_gr_clst_spl_cnd_res_plot_png - - cmp_gr_cnd_spl_clst_res_plot_png - xpr_per_cell_plot_png - xpr_avg_res_plot_png - xpr_dnst_res_plot_png @@ -733,7 +694,7 @@ $schemas: label: "Single-Cell WNN Cluster Analysis" s:name: "Single-Cell WNN Cluster Analysis" -s:alternateName: "Clusters multiome ATAC and RNA-Seq datasets, identifies gene markers and differentially accessible peaks" +s:alternateName: "Single-Cell WNN Cluster Analysis" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-wnn-cluster.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -773,9 +734,15 @@ s:creator: doc: | Single-Cell WNN Cluster Analysis - Clusters cells by similarity based on both gene 
expression and - chromatin accessibility data from the outputs of “Single-Cell - RNA-Seq Dimensionality Reduction Analysis” and “Single-Cell - ATAC-Seq Dimensionality Reduction Analysis” pipelines run - sequentially. The results of this workflow are primarily used - in “Single-Cell Manual Cell Type Assignment” pipeline. \ No newline at end of file + Clusters cells by similarity on the basis of both + gene expression and chromatin accessibility data + from the outputs of the “Single-Cell RNA-Seq + Dimensionality Reduction Analysis” and “Single-Cell + ATAC-Seq Dimensionality Reduction Analysis” pipelines + run sequentially. The results of this workflow are + used in the “Single-Cell Manual Cell Type Assignment”, + “Single-Cell RNA-Seq Differential Expression Analysis”, + “Single-Cell RNA-Seq Trajectory Analysis”, “Single-Cell + Differential Abundance Analysis”, “Single-Cell ATAC-Seq + Differential Accessibility Analysis”, and “Single-Cell + ATAC-Seq Genome Coverage” pipelines. \ No newline at end of file From 940959255ce27ee1aca160887a41fdddc5476199 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 19 Mar 2024 15:26:48 -0400 Subject: [PATCH 125/162] Update Cellranger ATAC Count pipeline to support inputs from scMultiome assay --- tools/cellranger-atac-count.cwl | 25 ++++++++++++++++++------- workflows/cellranger-atac-count.cwl | 23 ++++++++++++++++++----- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/tools/cellranger-atac-count.cwl b/tools/cellranger-atac-count.cwl index 0125b5ba..bcddf7af 100644 --- a/tools/cellranger-atac-count.cwl +++ b/tools/cellranger-atac-count.cwl @@ -84,10 +84,21 @@ inputs: peaks as cells. N must be a positive integer <= 20,000. Please consult the documentation before using this option + chemistry: + type: string? + inputBinding: + position: 12 + prefix: "--chemistry" + doc: | + Assay configuration. NOTE: by default the assay + configuration is detected automatically. 
Use + "ARC-v1" to indicate that it is a library from + the multiome assay. + threads: type: int? inputBinding: - position: 12 + position: 13 prefix: "--localcores" doc: | Set max cores the pipeline may request at one time. @@ -96,7 +107,7 @@ inputs: memory_limit: type: int? inputBinding: - position: 13 + position: 14 prefix: "--localmem" doc: | Set max GB the pipeline may request at one time @@ -105,7 +116,7 @@ inputs: virt_memory_limit: type: int? inputBinding: - position: 14 + position: 15 prefix: "--localvmem" doc: | Set max virtual address space in GB for the pipeline @@ -267,9 +278,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cell Ranger ATAC Count" -s:name: "Cell Ranger ATAC Count" -s:alternateName: "Counts reads from a single scATAC-Seq library" +label: "Cell Ranger Count (ATAC)" +s:name: "Cell Ranger Count (ATAC)" +s:alternateName: "Cell Ranger Count (ATAC)" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-atac-count.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -307,7 +318,7 @@ s:creator: doc: | - Cell Ranger ATAC Count + Cell Ranger Count (ATAC) Counts reads from a single scATAC-Seq library. diff --git a/workflows/cellranger-atac-count.cwl b/workflows/cellranger-atac-count.cwl index 9a954f6b..ea15157b 100644 --- a/workflows/cellranger-atac-count.cwl +++ b/workflows/cellranger-atac-count.cwl @@ -41,6 +41,15 @@ inputs: default: 20 "sd:upstreamSource": "genome_indices/memory_limit" + multiome_arc: + type: boolean? + default: false + label: "scATAC-Seq files come from scMultiome experiment" + doc: | + Changes chemistry type parameter to indicate + that scATAC-Seq data is part of the scMultiome + experiment. 
+ fastq_file_r1: type: - File @@ -426,6 +435,9 @@ steps: fastq_file_r3: extract_fastq_r3/fastq_file indices_folder: indices_folder force_cells: force_cells + chemistry: + source: multiome_arc + valueFrom: $(self?"ARC-v1":null) threads: source: threads valueFrom: $(parseInt(self)) @@ -508,7 +520,7 @@ $schemas: label: "Cell Ranger Count (ATAC)" s:name: "Cell Ranger Count (ATAC)" -s:alternateName: "Quantifies single-cell chromatin accessibility of the sequencing data from a single 10x Genomics library" +s:alternateName: "Cell Ranger Count (ATAC)" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/cellranger-atac-count.cwl s:codeRepository: https://github.com/datirium/workflows @@ -548,7 +560,8 @@ s:creator: doc: | Cell Ranger Count (ATAC) - Quantifies single-cell chromatin accessibility of the sequencing - data from a single 10x Genomics library. The results of this - workflow are primarily used in “Cell Ranger Aggregate (ATAC)” - pipeline. \ No newline at end of file + Quantifies single-cell chromatin accessibility of the + sequencing data from a single 10x Genomics library. + The results of this workflow are used in either the + “Single-Cell ATAC-Seq Filtering Analysis” or “Cell + Ranger Aggregate (ATAC)” pipeline. \ No newline at end of file From d9884d2cfcfea0fe9d869e84f2a65d225c01d832 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 22 Mar 2024 15:17:53 -0400 Subject: [PATCH 126/162] Add minpct input to sc-rna-de-pseudobulk workflow --- tools/sc-rna-de-pseudobulk.cwl | 206 +++++++++++++++++------------ workflows/sc-rna-de-pseudobulk.cwl | 71 +++++----- 2 files changed, 160 insertions(+), 117 deletions(-) diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index 87dbba0f..3dc85ae0 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -23,8 +23,8 @@ inputs: doc: | Path to the RDS file to load Seurat object from.
This file should include genes expression information - stored in the RNA assay. Additionally, 'rnaumap', - and/or 'atacumap', and/or 'wnnumap' dimensionality + stored in the RNA assay. Additionally, rnaumap, + and/or atacumap, and/or wnnumap dimensionality reductions should be present. datasets_metadata: @@ -34,8 +34,8 @@ inputs: doc: | Path to the TSV/CSV file to optionally extend Seurat object metadata with categorical values using samples - identities. First column - 'library_id' should - correspond to all unique values from the 'new.ident' + identities. First column - library_id should + correspond to all unique values from the new.ident column of the loaded Seurat object. If any of the provided in this file columns are already present in the Seurat object metadata, they will be overwritten. @@ -50,7 +50,7 @@ inputs: doc: | Path to the TSV/CSV file to optionally prefilter and extend Seurat object metadata by selected barcodes. - First column should be named as 'barcode'. If file + First column should be named as barcode. If file includes any other columns they will be added to the Seurat object metadata ovewriting the existing ones if those are present. Default: all cells used, no extra @@ -168,6 +168,15 @@ inputs: adjusted P-value not bigger than this value. Default: 0.05 + minimum_pct: + type: float? + inputBinding: + prefix: "--minpct" + doc: | + Include only those genes that are detected in not lower than this + fraction of cells in either of the two tested conditions. + Default: 0.1 + genes_of_interest: type: - "null" @@ -338,60 +347,66 @@ outputs: outputBinding: glob: "*_umap_rd_rnaumap.png" doc: | - UMAP, split by selected criteria, - optionally subsetted to the specific - group, RNA. - PNG format + UMAP with cells selected for analysis. + Split by selected criteria; optionally + subsetted to the specific group; + reduction rnaumap. + PNG format. umap_rd_rnaumap_plot_pdf: type: File? 
outputBinding: glob: "*_umap_rd_rnaumap.pdf" doc: | - UMAP, split by selected criteria, - optionally subsetted to the specific - group, RNA. - PDF format + UMAP with cells selected for analysis. + Split by selected criteria; optionally + subsetted to the specific group; + reduction rnaumap. + PDF format. umap_rd_atacumap_plot_png: type: File? outputBinding: glob: "*_umap_rd_atacumap.png" doc: | - UMAP, split by selected criteria, - optionally subsetted to the specific - group, ATAC. - PNG format + UMAP with cells selected for analysis. + Split by selected criteria; optionally + subsetted to the specific group; + reduction atacumap. + PNG format. umap_rd_atacumap_plot_pdf: type: File? outputBinding: glob: "*_umap_rd_atacumap.pdf" doc: | - UMAP, split by selected criteria, - optionally subsetted to the specific - group, ATAC. - PDF format + UMAP with cells selected for analysis. + Split by selected criteria; optionally + subsetted to the specific group; + reduction atacumap. + PDF format. umap_rd_wnnumap_plot_png: type: File? outputBinding: glob: "*_umap_rd_wnnumap.png" doc: | - UMAP, split by selected criteria, - optionally subsetted to the specific - group, WNN. - PNG format + UMAP with cells selected for analysis. + Split by selected criteria; optionally + subsetted to the specific group; + reduction wnnumap. + PNG format. umap_rd_wnnumap_plot_pdf: type: File? outputBinding: glob: "*_umap_rd_wnnumap.pdf" doc: | - UMAP, split by selected criteria, - optionally subsetted to the specific - group, WNN. - PDF format + UMAP with cells selected for analysis. + Split by selected criteria; optionally + subsetted to the specific group; + reduction wnnumap. + PDF format. mds_plot_html: type: File? @@ -400,7 +415,7 @@ outputs: doc: | MDS plot of pseudobulk aggregated normalized reads counts. - HTML format + HTML format. pca_1_2_plot_png: type: File? @@ -408,7 +423,7 @@ outputs: glob: "*_pca_1_2.png" doc: | Gene expression PCA (1,2). - PNG format + PNG format. 
pca_1_2_plot_pdf: type: File? @@ -416,7 +431,7 @@ outputs: glob: "*_pca_1_2.pdf" doc: | Gene expression PCA (1,2). - PDF format + PDF format. pca_2_3_plot_png: type: File? @@ -432,13 +447,14 @@ outputs: glob: "*_pca_2_3.pdf" doc: | Gene expression PCA (2,3). - PDF format + PDF format. dxpr_vlcn_plot_png: type: File? outputBinding: glob: "*_dxpr_vlcn.png" doc: | + Differentially expressed genes. Volcano plot of differentially expressed genes. Highlighed genes are either provided by user or top 10 genes with the highest log2FoldChange @@ -446,13 +462,14 @@ outputs: as --second vs --first. Cells are optionally subsetted to the specific group and optionally coerced to the pseudobulk form. - PNG format + PNG format. dxpr_vlcn_plot_pdf: type: File? outputBinding: glob: "*_dxpr_vlcn.pdf" doc: | + Differentially expressed genes. Volcano plot of differentially expressed genes. Highlighed genes are either provided by user or top 10 genes with the highest log2FoldChange @@ -460,31 +477,33 @@ outputs: as --second vs --first. Cells are optionally subsetted to the specific group and optionally coerced to the pseudobulk form. - PDF format + PDF format. xpr_dnst_plot_png: type: File? outputBinding: glob: "*_xpr_dnst.png" doc: | + Gene expression density. Gene expression violin plots for either user provided or top 10 differentially expressed genes with the highest log2FoldChange values. The direction of comparison is defined as --second vs --first. - PNG format + PNG format. xpr_dnst_plot_pdf: type: File? outputBinding: glob: "*_xpr_dnst.pdf" doc: | + Gene expression density. Gene expression violin plots for either user provided or top 10 differentially expressed genes with the highest log2FoldChange values. The direction of comparison is defined as --second vs --first. - PDF format + PDF format. 
xpr_per_cell_rd_rnaumap_plot_png: type: @@ -494,10 +513,11 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_rnaumap_*.png" doc: | - UMAP, gene expression, split by selected - criteria, optionally subsetted to the - specific group, RNA. - PNG format + UMAP colored by gene expression. + Split by selected criteria; optionally + subsetted to the specific group; + reduction rnaumap. + PNG format. xpr_per_cell_rd_rnaumap_plot_pdf: type: @@ -507,10 +527,11 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_rnaumap_*.pdf" doc: | - UMAP, gene expression, split by selected - criteria, optionally subsetted to the - specific group, RNA. - PDF format + UMAP colored by gene expression. + Split by selected criteria; optionally + subsetted to the specific group; + reduction rnaumap. + PDF format. xpr_per_cell_rd_atacumap_plot_png: type: @@ -520,10 +541,11 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_atacumap_*.png" doc: | - UMAP, gene expression, split by selected - criteria, optionally subsetted to the - specific group, ATAC. - PNG format + UMAP colored by gene expression. + Split by selected criteria; optionally + subsetted to the specific group; + reduction atacumap. + PNG format. xpr_per_cell_rd_atacumap_plot_pdf: type: @@ -533,10 +555,11 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_atacumap_*.pdf" doc: | - UMAP, gene expression, split by selected - criteria, optionally subsetted to the - specific group, ATAC. - PDF format + UMAP colored by gene expression. + Split by selected criteria; optionally + subsetted to the specific group; + reduction atacumap. + PDF format. xpr_per_cell_rd_wnnumap_plot_png: type: @@ -546,10 +569,11 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_wnnumap_*.png" doc: | - UMAP, gene expression, split by selected - criteria, optionally subsetted to the - specific group, WNN. - PNG format + UMAP colored by gene expression. + Split by selected criteria; optionally + subsetted to the specific group; + reduction wnnumap. + PNG format. 
xpr_per_cell_rd_wnnumap_plot_pdf: type: @@ -559,30 +583,31 @@ outputs: outputBinding: glob: "*_xpr_per_cell_rd_wnnumap_*.pdf" doc: | - UMAP, gene expression, split by selected - criteria, optionally subsetted to the - specific group, WNN. - PDF format + UMAP colored by gene expression. + Split by selected criteria; optionally + subsetted to the specific group; + reduction wnnumap. + PDF format. xpr_htmp_plot_png: type: File? outputBinding: glob: "*_xpr_htmp.png" doc: | - Gene expression heatmap, filtered by adjusted - P-value, optionally subsetted to the specific - groups of cells. - PNG format + Gene expression heatmap. + Filtered by adjusted p-value; optionally + subsetted to the specific groups. + PNG format. xpr_htmp_plot_pdf: type: File? outputBinding: glob: "*_xpr_htmp.pdf" doc: | - Gene expression heatmap, filtered by adjusted - P-value, optionally subsetted to the specific - groups of cells. - PDF format + Gene expression heatmap. + Filtered by adjusted p-value; optionally + subsetted to the specific groups. + PDF format. diff_expr_genes: type: File? @@ -590,8 +615,8 @@ outputs: glob: "*_de_genes.tsv" doc: | Differentially expressed genes. - Not filtered by adjusted P-value. - TSV format + Not filtered by adjusted p-value. + TSV format. bulk_read_counts_gct: type: File? @@ -599,17 +624,19 @@ outputs: glob: "*_bulk_counts.gct" doc: | GSEA compatible not filtered normalized - reads counts aggregated to pseudobulk form. - GCT format + reads counts aggregated to pseudobulk + form. + GCT format. bulk_phenotypes_cls: type: File? outputBinding: glob: "*_bulk_phntps.cls" doc: | - GSEA compatible phenotypes file defined based - on --splitby, --first, and --second parameters. - CLS format + GSEA compatible phenotypes file defined + based on --splitby, --first, and --second + parameters. + CLS format. cell_read_counts_gct: type: File? @@ -617,7 +644,7 @@ outputs: glob: "*_cell_counts.gct" doc: | Filtered normalized reads counts per cell. - GCT format + GCT format. 
stdout_log: type: stdout @@ -641,7 +668,7 @@ $schemas: label: "Single-Cell RNA-Seq Differential Expression Analysis" s:name: "Single-Cell RNA-Seq Differential Expression Analysis" -s:alternateName: "Identifies differentially expressed genes between two groups of cells optionally coerced to pseudobulk form" +s:alternateName: "Single-Cell RNA-Seq Differential Expression Analysis" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-rna-de-pseudobulk.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -681,12 +708,13 @@ s:creator: doc: | Single-Cell RNA-Seq Differential Expression Analysis - Identifies differentially expressed genes between two - groups of cells optionally coerced to pseudobulk form + Identifies differentially expressed genes between any + two groups of cells, optionally aggregating gene + expression data from single-cell to pseudobulk form. s:about: | - usage: /usr/local/bin/sc_rna_de_pseudobulk.R [-h] --query QUERY + usage: sc_rna_de_pseudobulk.R [-h] --query QUERY [--metadata METADATA] [--barcodes BARCODES] [--groupby GROUPBY] @@ -695,6 +723,7 @@ s:about: | --second SECOND [--test {wilcoxon,likelihood-ratio,t-test,negative-binomial,poisson,logistic-regression,mast,deseq,deseq-lrt}] [--batchby BATCHBY] [--padj PADJ] + [--minpct MINPCT] [--genes [GENES [GENES ...]]] [--exclude EXCLUDE] [--cluster {row,column,both}] @@ -712,13 +741,13 @@ s:about: | -h, --help show this help message and exit --query QUERY Path to the RDS file to load Seurat object from. This file should include genes expression information - stored in the RNA assay. Additionally, 'rnaumap', - and/or 'atacumap', and/or 'wnnumap' dimensionality - reductions should be present. + stored in the RNA assay. Additionally, rnaumap, and/or + atacumap, and/or wnnumap dimensionality reductions + should be present. --metadata METADATA Path to the TSV/CSV file to optionally extend Seurat object metadata with categorical values using samples - identities. 
First column - 'library_id' should - correspond to all unique values from the 'new.ident' + identities. First column - library_id should + correspond to all unique values from the new.ident column of the loaded Seurat object. If any of the provided in this file columns are already present in the Seurat object metadata, they will be overwritten. @@ -727,7 +756,7 @@ s:about: | be applied. Default: no extra metadata is added --barcodes BARCODES Path to the TSV/CSV file to optionally prefilter and extend Seurat object metadata by selected barcodes. - First column should be named as 'barcode'. If file + First column should be named as barcode. If file includes any other columns they will be added to the Seurat object metadata ovewriting the existing ones if those are present. Default: all cells used, no extra @@ -783,6 +812,9 @@ s:about: | output only differentially expressed genes with adjusted P-value not bigger than this value. Default: 0.05 + --minpct MINPCT Include only those genes that are detected in not + lower than this fraction of cells in either of the two + tested conditions. Default: 0.1 --genes [GENES [GENES ...]] Genes of interest to label on the generated plots. Default: top 10 genes with the highest and the lowest diff --git a/workflows/sc-rna-de-pseudobulk.cwl b/workflows/sc-rna-de-pseudobulk.cwl index c2332196..6cbe2842 100644 --- a/workflows/sc-rna-de-pseudobulk.cwl +++ b/workflows/sc-rna-de-pseudobulk.cwl @@ -182,12 +182,22 @@ inputs: maximum_padj: type: float? default: 0.05 - label: "Maximum adjusted P-value" + label: "Maximum adjusted p-value" doc: | - Maximum adjusted P-value threshold for + Maximum adjusted p-value threshold for selecting differentially expressed genes to be visualized on the heatmap. + minimum_pct: + type: float? 
+ default: 0.1 + label: "Minimum fraction of cells where a gene should be expressed" + doc: | + Minimum fraction of cells in either of + the two tested conditions where the gene + should be expressed to be included into + analysis. + enable_clustering: type: boolean? default: false @@ -344,8 +354,8 @@ outputs: in PNG format "sd:visualPlugins": - image: - tab: 'QC' - Caption: 'Gene expression PCA (1,2)' + tab: "QC" + Caption: "Gene expression PCA (1,2)" pca_2_3_plot_png: type: File? @@ -356,8 +366,8 @@ outputs: in PNG format "sd:visualPlugins": - image: - tab: 'QC' - Caption: 'Gene expression PCA (2,3)' + tab: "QC" + Caption: "Gene expression PCA (2,3)" umap_rd_rnaumap_plot_png: type: File? @@ -372,8 +382,8 @@ outputs: PNG format "sd:visualPlugins": - image: - tab: 'QC' - Caption: 'UMAP, split by comparison category, RNA' + tab: "QC" + Caption: "UMAP, split by comparison category, RNA" umap_rd_atacumap_plot_png: type: File? @@ -388,8 +398,8 @@ outputs: PNG format "sd:visualPlugins": - image: - tab: 'QC' - Caption: 'UMAP, split by comparison category, ATAC' + tab: "QC" + Caption: "UMAP, split by comparison category, ATAC" umap_rd_wnnumap_plot_png: type: File? @@ -404,8 +414,8 @@ outputs: PNG format "sd:visualPlugins": - image: - tab: 'QC' - Caption: 'UMAP, split by comparison category, WNN' + tab: "QC" + Caption: "UMAP, split by comparison category, WNN" dxpr_vlcn_plot_png: type: File? @@ -419,7 +429,7 @@ outputs: "sd:visualPlugins": - image: tab: "Genes of interest" - Caption: 'Volcano plot of differentially expressed genes' + Caption: "Volcano plot of differentially expressed genes" xpr_dnst_plot_png: type: File? @@ -434,7 +444,7 @@ outputs: "sd:visualPlugins": - image: tab: "Genes of interest" - Caption: 'Gene expression violin plot' + Caption: "Gene expression violin plot" xpr_htmp_plot_png: type: File? 
@@ -442,13 +452,13 @@ outputs: label: "Gene expression heatmap" doc: | Gene expression heatmap, filtered - by adjusted P-value, optionally + by adjusted p-value, optionally subsetted to the specific groups of cells in PNG format "sd:visualPlugins": - image: - tab: 'Heatmap' - Caption: 'Gene expression heatmap' + tab: "Heatmap" + Caption: "Gene expression heatmap" xpr_per_cell_rd_rnaumap_plot_png: type: @@ -463,8 +473,8 @@ outputs: specific group, RNA, PNG format "sd:visualPlugins": - image: - tab: 'Gene expression, RNA' - Caption: 'UMAP, gene expression, RNA' + tab: "Gene expression, RNA" + Caption: "UMAP, gene expression, RNA" xpr_per_cell_rd_atacumap_plot_png: type: @@ -479,8 +489,8 @@ outputs: specific group, ATAC, PNG format "sd:visualPlugins": - image: - tab: 'Gene expression, ATAC' - Caption: 'UMAP, gene expression, ATAC' + tab: "Gene expression, ATAC" + Caption: "UMAP, gene expression, ATAC" xpr_per_cell_rd_wnnumap_plot_png: type: @@ -495,21 +505,21 @@ outputs: specific group, WNN, PNG format "sd:visualPlugins": - image: - tab: 'Gene expression, WNN' - Caption: 'UMAP, gene expression, WNN' + tab: "Gene expression, WNN" + Caption: "UMAP, gene expression, WNN" diff_expr_genes: type: File outputSource: de_pseudobulk/diff_expr_genes label: "Differentially expressed genes" doc: | - Not filtered by adjusted P-value + Not filtered by adjusted p-value differentially expressed genes in TSV format "sd:visualPlugins": - syncfusiongrid: - tab: 'Diff. expressed genes' - Title: 'Differentially expressed genes' + tab: "Diff. expressed genes" + Title: "Differentially expressed genes" read_counts_file: type: File? 
@@ -600,6 +610,7 @@ steps: source: batchby valueFrom: $(self==""?null:self) # safety measure maximum_padj: maximum_padj + minimum_pct: minimum_pct genes_of_interest: source: genes_of_interest valueFrom: $(split_features(self)) @@ -731,7 +742,7 @@ $schemas: label: "Single-Cell RNA-Seq Differential Expression Analysis" s:name: "Single-Cell RNA-Seq Differential Expression Analysis" -s:alternateName: "Identifies differentially expressed genes between two groups of cells optionally coerced to pseudobulk form" +s:alternateName: "Single-Cell RNA-Seq Differential Expression Analysis" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-de-pseudobulk.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -771,6 +782,6 @@ s:creator: doc: | Single-Cell RNA-Seq Differential Expression Analysis - Identifies differentially expressed genes between any two - groups of cells, optionally aggregating gene expression - data from single-cell to pseudobulk form. \ No newline at end of file + Identifies differentially expressed genes between any + two groups of cells, optionally aggregating gene + expression data from single-cell to pseudobulk form. 
\ No newline at end of file From 132ba264743fd4922897eddd67fb6d237ca21731 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 22 Mar 2024 15:31:17 -0400 Subject: [PATCH 127/162] Deacrease default threads to 4 in all sc related workflows --- workflows/cellranger-aggr.cwl | 2 +- workflows/cellranger-arc-aggr.cwl | 2 +- workflows/cellranger-arc-count.cwl | 2 +- workflows/cellranger-atac-aggr.cwl | 2 +- workflows/cellranger-atac-count.cwl | 2 +- workflows/cellranger-mkref.cwl | 3 +-- workflows/cellranger-multi.cwl | 2 +- workflows/cellranger-reanalyze.cwl | 2 +- workflows/sc-atac-cluster.cwl | 4 ++-- workflows/sc-atac-coverage.cwl | 4 ++-- workflows/sc-atac-dbinding.cwl | 4 ++-- workflows/sc-atac-reduce.cwl | 4 ++-- workflows/sc-ctype-assign.cwl | 4 ++-- workflows/sc-multiome-filter.cwl | 4 ++-- workflows/sc-rna-cluster.cwl | 4 ++-- workflows/sc-rna-da-cells.cwl | 4 ++-- workflows/sc-rna-de-pseudobulk.cwl | 4 ++-- workflows/sc-rna-filter.cwl | 4 ++-- workflows/sc-rna-reduce.cwl | 4 ++-- workflows/sc-rna-trajectory.cwl | 4 ++-- workflows/sc-triangulate.cwl | 4 ++-- workflows/sc-vdj-profile.cwl | 4 ++-- workflows/sc-wnn-cluster.cwl | 4 ++-- workflows/single-cell-preprocess-cellranger.cwl | 4 ++-- 24 files changed, 40 insertions(+), 41 deletions(-) diff --git a/workflows/cellranger-aggr.cwl b/workflows/cellranger-aggr.cwl index 77624894..cb31e4e7 100644 --- a/workflows/cellranger-aggr.cwl +++ b/workflows/cellranger-aggr.cwl @@ -79,7 +79,7 @@ inputs: threads: type: int? 
- default: 6 + default: 4 label: "Number of threads" doc: "Number of threads for those steps that support multithreading" "sd:layout": diff --git a/workflows/cellranger-arc-aggr.cwl b/workflows/cellranger-arc-aggr.cwl index 3d543f0d..e1195fe4 100644 --- a/workflows/cellranger-arc-aggr.cwl +++ b/workflows/cellranger-arc-aggr.cwl @@ -102,7 +102,7 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the diff --git a/workflows/cellranger-arc-count.cwl b/workflows/cellranger-arc-count.cwl index 934e9b8b..17c69123 100644 --- a/workflows/cellranger-arc-count.cwl +++ b/workflows/cellranger-arc-count.cwl @@ -131,7 +131,7 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the diff --git a/workflows/cellranger-atac-aggr.cwl b/workflows/cellranger-atac-aggr.cwl index 04ba9aea..1553a264 100644 --- a/workflows/cellranger-atac-aggr.cwl +++ b/workflows/cellranger-atac-aggr.cwl @@ -94,7 +94,7 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the diff --git a/workflows/cellranger-atac-count.cwl b/workflows/cellranger-atac-count.cwl index ea15157b..72705def 100644 --- a/workflows/cellranger-atac-count.cwl +++ b/workflows/cellranger-atac-count.cwl @@ -96,7 +96,7 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the diff --git a/workflows/cellranger-mkref.cwl b/workflows/cellranger-mkref.cwl index 013e1ad1..20b99c0f 100644 --- a/workflows/cellranger-mkref.cwl +++ b/workflows/cellranger-mkref.cwl @@ -12,7 +12,6 @@ requirements: "sd:upstream": genome_indices: - "genome-indices.cwl" - - "https://github.com/datirium/workflows/workflows/genome-indices.cwl" inputs: @@ -58,7 +57,7 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the diff 
--git a/workflows/cellranger-multi.cwl b/workflows/cellranger-multi.cwl index 76c73f56..073403e6 100644 --- a/workflows/cellranger-multi.cwl +++ b/workflows/cellranger-multi.cwl @@ -123,7 +123,7 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the diff --git a/workflows/cellranger-reanalyze.cwl b/workflows/cellranger-reanalyze.cwl index 62fedddd..3d63b6bd 100644 --- a/workflows/cellranger-reanalyze.cwl +++ b/workflows/cellranger-reanalyze.cwl @@ -391,7 +391,7 @@ inputs: threads: type: int? - default: 6 + default: 4 label: "Number of threads" doc: "Number of threads for those steps that support multithreading" "sd:layout": diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index 2ba9fc44..dbfde3e3 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -161,13 +161,13 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 6 + Default: 4 "sd:layout": advanced: true diff --git a/workflows/sc-atac-coverage.cwl b/workflows/sc-atac-coverage.cwl index 14ddccde..d83437f1 100644 --- a/workflows/sc-atac-coverage.cwl +++ b/workflows/sc-atac-coverage.cwl @@ -127,13 +127,13 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Number of cores/cpus to use" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 6 + Default: 4 "sd:layout": advanced: true diff --git a/workflows/sc-atac-dbinding.cwl b/workflows/sc-atac-dbinding.cwl index 5699e0c6..e98462bb 100644 --- a/workflows/sc-atac-dbinding.cwl +++ b/workflows/sc-atac-dbinding.cwl @@ -262,13 +262,13 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Number of cores/cpus to use" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. 
- Default: 6 + Default: 4 "sd:layout": advanced: true diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index a506a983..c4e0ca8b 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -214,13 +214,13 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 6 + Default: 4 "sd:layout": advanced: true diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index dc87ed2b..cb56a41f 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -209,13 +209,13 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 6 + Default: 4 "sd:layout": advanced: true diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index ebf0acac..606e8869 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -469,13 +469,13 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 6 + Default: 4 "sd:layout": advanced: true diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index 6dbf9543..19e1dcaf 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -138,13 +138,13 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. 
- Default: 6 + Default: 4 "sd:layout": advanced: true diff --git a/workflows/sc-rna-da-cells.cwl b/workflows/sc-rna-da-cells.cwl index 08ec4421..eb2d8786 100644 --- a/workflows/sc-rna-da-cells.cwl +++ b/workflows/sc-rna-da-cells.cwl @@ -137,13 +137,13 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Number of cores/cpus to use" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 6 + Default: 4 "sd:layout": advanced: true diff --git a/workflows/sc-rna-de-pseudobulk.cwl b/workflows/sc-rna-de-pseudobulk.cwl index 6cbe2842..179e54a4 100644 --- a/workflows/sc-rna-de-pseudobulk.cwl +++ b/workflows/sc-rna-de-pseudobulk.cwl @@ -290,13 +290,13 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 6 + Default: 4 "sd:layout": advanced: true diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index 068db192..066bd100 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -269,13 +269,13 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Number of cores/cpus to use" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 6 + Default: 4 "sd:layout": advanced: true diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index e180b59d..7460ed65 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -290,13 +290,13 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. 
- Default: 6 + Default: 4 "sd:layout": advanced: true diff --git a/workflows/sc-rna-trajectory.cwl b/workflows/sc-rna-trajectory.cwl index 5ba0aa5e..78236386 100644 --- a/workflows/sc-rna-trajectory.cwl +++ b/workflows/sc-rna-trajectory.cwl @@ -152,13 +152,13 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 6 + Default: 4 "sd:layout": advanced: true diff --git a/workflows/sc-triangulate.cwl b/workflows/sc-triangulate.cwl index f46ab915..59bbc8df 100644 --- a/workflows/sc-triangulate.cwl +++ b/workflows/sc-triangulate.cwl @@ -113,13 +113,13 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Number of cores/cpus to use" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 6 + Default: 4 "sd:layout": advanced: true diff --git a/workflows/sc-vdj-profile.cwl b/workflows/sc-vdj-profile.cwl index 57ae8e1e..31ad21cf 100644 --- a/workflows/sc-vdj-profile.cwl +++ b/workflows/sc-vdj-profile.cwl @@ -172,13 +172,13 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 6 + Default: 4 "sd:layout": advanced: true diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index a7eb543a..48a871f3 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -187,13 +187,13 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. 
- Default: 6 + Default: 4 "sd:layout": advanced: true diff --git a/workflows/single-cell-preprocess-cellranger.cwl b/workflows/single-cell-preprocess-cellranger.cwl index 488eb1ef..de0af624 100644 --- a/workflows/single-cell-preprocess-cellranger.cwl +++ b/workflows/single-cell-preprocess-cellranger.cwl @@ -92,13 +92,13 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 6 + Default: 4 "sd:layout": advanced: true From d0e0c10f687df261c4aef4c69ab23726bd55fef7 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 28 Mar 2024 16:21:42 -0400 Subject: [PATCH 128/162] Add Single-Cell ATAC-Seq Filtering Analysis pipeline --- tools/sc-atac-cluster.cwl | 2 +- tools/sc-atac-coverage.cwl | 2 +- tools/sc-atac-dbinding.cwl | 2 +- tools/sc-atac-filter.cwl | 1038 ++++++++++++++++++++++++++ tools/sc-atac-reduce.cwl | 2 +- tools/sc-ctype-assign.cwl | 2 +- tools/sc-multiome-filter.cwl | 1164 ++++++++---------------------- tools/sc-rna-cluster.cwl | 2 +- tools/sc-rna-da-cells.cwl | 2 +- tools/sc-rna-de-pseudobulk.cwl | 2 +- tools/sc-rna-filter.cwl | 417 +++-------- tools/sc-rna-reduce.cwl | 2 +- tools/sc-rna-trajectory.cwl | 2 +- tools/sc-triangulate.cwl | 2 +- tools/sc-vdj-profile.cwl | 2 +- tools/sc-wnn-cluster.cwl | 2 +- workflows/sc-atac-filter.cwl | 936 ++++++++++++++++++++++++ workflows/sc-atac-reduce.cwl | 1 + workflows/sc-multiome-filter.cwl | 727 +++++++++---------- workflows/sc-rna-filter.cwl | 393 +++++----- 20 files changed, 2933 insertions(+), 1769 deletions(-) create mode 100644 tools/sc-atac-filter.cwl create mode 100644 workflows/sc-atac-filter.cwl diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index 18d96e62..931a7394 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.34 + 
dockerPull: biowardrobe2/sc-tools:v0.0.35 inputs: diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index 5f97216b..e7a2e2b2 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.34 + dockerPull: biowardrobe2/sc-tools:v0.0.35 inputs: diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index bc73f8ff..8d97e2f5 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.34 + dockerPull: biowardrobe2/sc-tools:v0.0.35 inputs: diff --git a/tools/sc-atac-filter.cwl b/tools/sc-atac-filter.cwl new file mode 100644 index 00000000..c2b024d8 --- /dev/null +++ b/tools/sc-atac-filter.cwl @@ -0,0 +1,1038 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: InitialWorkDirRequirement + listing: + - entryname: dummy_metadata.csv + entry: | + library_id + Experiment +- class: EnvVarRequirement + envDef: + R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-tools:v0.0.35 + + +inputs: + + feature_bc_matrices_folder: + type: Directory + inputBinding: + prefix: "--mex" + doc: | + Path to the folder with feature-barcode matrix from Cell Ranger Count (ATAC), + Cell Ranger Count (RNA+ATAC), Cell Ranger Aggregate (ATAC), or Cell Ranger + Aggregate (RNA+ATAC) experiment in MEX format. For RNA+ATAC experiments the + rows consisting genes will be ignored. + + aggregation_metadata: + type: File? + doc: | + Path to the metadata TSV/CSV file to set the datasets identities. If --mex points + to the Cell Ranger Aggregate (ATAC) or Cell Ranger Aggregate (RNA+ATAC) outputs, + the aggr.csv file can be used. 
If Cell Ranger Count (ATAC) or Cell Ranger Count + (RNA+ATAC) outputs have been used in the --mex input, the file should include at + least one column - library_id and one row with the alias for that experiment. + + atac_fragments_file: + type: File + secondaryFiles: + - .tbi + inputBinding: + prefix: "--fragments" + doc: | + Count and barcode information for every ATAC fragment observed in the experiment in TSV + format. Tbi-index file is required. + + annotation_gtf_file: + type: File + inputBinding: + prefix: "--annotations" + doc: | + Path to the genome annotation file in GTF format. + + chrom_length_file: + type: File + inputBinding: + prefix: "--seqinfo" + doc: | + Path to the headerless chromosome length file in TSV format + + grouping_data: + type: File? + inputBinding: + prefix: "--grouping" + doc: | + Path to the TSV/CSV file to define datasets grouping. + First column - 'library_id' with the values and order + that correspond to the 'library_id' column from the + '--identity' file, second column 'condition'. + Default: each dataset is assigned to its own group. + + blacklist_regions_file: + type: + - "null" + - File + - type: enum + symbols: + - "hg19" + - "hg38" + - "mm10" + inputBinding: + prefix: "--blacklist" + valueFrom: | + ${ + if (self.class && self.class == "File"){ + return self; + } else if (self == "hg19") { + return "/opt/sc_tools/hg19-blacklist.v2.bed"; + } else if (self == "hg38") { + return "/opt/sc_tools/hg38-blacklist.v2.bed"; + } else if (self == "mm10") { + return "/opt/sc_tools/mm10-blacklist.v2.bed"; + } else { + return null; + } + } + doc: | + Path to the optional BED file with the genomic blacklist regions. + If a string value is provided, it should be one of the hg19, hg38, + or mm10 as we replace it with the file location from docker image + + barcodes_data: + type: File? + inputBinding: + prefix: "--barcodes" + doc: | + Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. 
+ First column should be named as barcode. If file + includes any other columns they will be added to the + Seurat object metadata overwriting the existing ones if + those are present. + Default: all cells used, no extra metadata is added + + atac_minimum_cells: + type: int? + inputBinding: + prefix: "--atacmincells" + doc: | + Include only peaks detected in at least this many cells. + Default: 5 (applied to all datasets) + + minimum_fragments: + type: + - "null" + - int + - int[] + inputBinding: + prefix: "--minfragments" + doc: | + Include cells where at least this many ATAC fragments in peaks are + detected. If multiple values are provided, each of them will be + applied to the corresponding dataset from the '--mex' input + based on the '--identity' file. Any 0 will be replaced with the + auto-estimated threshold (median - 2.5 * MAD) calculated per dataset. + Default: 1000 (applied to all datasets) + + maximum_nucl_signal: + type: + - "null" + - float + - float[] + inputBinding: + prefix: "--maxnuclsignal" + doc: | + Include cells with the nucleosome signal not bigger than this value. + Nucleosome signal quantifies the approximate ratio of mononucleosomal + to nucleosome-free ATAC fragments. If multiple values are provided, each of + them will be applied to the corresponding dataset from the '--mex' input + based on the '--identity' file. + Default: 4 (applied to all datasets) + + minimum_tss_enrich: + type: + - "null" + - float + - float[] + inputBinding: + prefix: "--mintssenrich" + doc: | + Include cells with the TSS enrichment score not lower than this value. + Score is calculated based on the ratio of ATAC fragments centered at the TSS + to ATAC fragments in TSS-flanking regions. If multiple values are provided, each + of them will be applied to the corresponding dataset from the '--mex' input + based on the '--identity' file. + Default: 2 (applied to all datasets) + + minimum_frip: + type: float? 
+ inputBinding: + prefix: "--minfrip" + doc: | + Include cells with the FRiP not lower than this + value. FRiP is calculated for ATAC fragments. + Default: 0.15 (applied to all datasets) + + maximum_blacklist_fraction: + type: + - "null" + - float + - float[] + inputBinding: + prefix: "--maxblacklist" + doc: | + Include cells with the fraction of ATAC fragments in + genomic blacklist regions not bigger than this value. + If multiple values provided, each of them will be + applied to the correspondent dataset from the '--mex' + input based on the '--identity' file. + Default: 0.05 (applied to all datasets) + + call_by: + type: string? + inputBinding: + prefix: "--callby" + doc: | + Replace Cell Ranger peaks with MACS2 peaks called + for cells grouped by the column from the optionally + provided --barcodes file. If --barcodes file was not + provided MACS2 peaks can be still called per dataset + by setting --callby to new.ident. Peaks are called + only after applying maximum nucleosome signal and + minimum TSS enrichment scores filters. + Default: do not call peaks + + minimum_qvalue: + type: float? + inputBinding: + prefix: "--qvalue" + doc: | + Minimum FDR (q-value) cutoff for MACS2 peak detection. + Ignored if --callby is not provided. Default: 0.05 + + remove_doublets: + type: boolean? + inputBinding: + prefix: "--removedoublets" + doc: | + Remove cells that were identified as doublets. + Default: do not remove doublets + + atac_doublet_rate: + type: float? + inputBinding: + prefix: "--atacdbr" + doc: | + Expected ATAC doublet rate. Default: 1 percent per thousand + cells captured with 10x genomics + + atac_doublet_rate_sd: + type: float? + inputBinding: + prefix: "--atacdbrsd" + doc: | + Uncertainty range in the ATAC doublet rate, interpreted as + a +/- around the value provided in --atacdbr. Set to 0 to + disable. Set to 1 to make the threshold depend entirely + on the misclassification rate. 
Default: 40 percent of the + value provided in --atacdbr + + export_pdf_plots: + type: boolean? + inputBinding: + prefix: "--pdf" + doc: | + Export plots in PDF. + Default: false + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + inputBinding: + prefix: "--theme" + doc: | + Color theme for all generated plots. One of gray, bw, linedraw, light, + dark, minimal, classic, void. + Default: classic + + verbose: + type: boolean? + inputBinding: + prefix: "--verbose" + doc: | + Print debug information. + Default: false + + export_h5seurat_data: + type: boolean? + inputBinding: + prefix: "--h5seurat" + doc: | + Save Seurat data to h5seurat file. + Default: false + + export_h5ad_data: + type: boolean? + inputBinding: + prefix: "--h5ad" + doc: | + Save raw counts from the ATAC assay to h5ad file. + Default: false + + export_ucsc_cb: + type: boolean? + inputBinding: + prefix: "--cbbuild" + doc: | + Export results to UCSC Cell Browser. Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix. + Default: ./sc + + parallel_memory_limit: + type: int? + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Default: 32 + + vector_memory_limit: + type: int? + default: 128 + doc: | + Maximum vector memory in GB allowed to be used by R. + Default: 128 + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. + Default: 1 + + seed: + type: int? + inputBinding: + prefix: "--seed" + doc: | + Seed number for random values. + Default: 42 + + +outputs: + + raw_1_2_qc_mtrcs_pca_plot_png: + type: File? + outputBinding: + glob: "*_raw_1_2_qc_mtrcs_pca.png" + doc: | + QC metrics PCA. + Unfiltered; PC1/PC2. + PNG format. + + raw_2_3_qc_mtrcs_pca_plot_png: + type: File? 
+ outputBinding: + glob: "*_raw_2_3_qc_mtrcs_pca.png" + doc: | + QC metrics PCA. + Unfiltered; PC2/PC3. + PNG format. + + raw_cells_count_plot_png: + type: File? + outputBinding: + glob: "*_raw_cells_count.png" + doc: | + Number of cells per dataset. + Unfiltered. + PNG format. + + raw_frgm_dnst_plot_png: + type: File? + outputBinding: + glob: "*_raw_frgm_dnst.png" + doc: | + Distribution of ATAC fragments in peaks + per cell. + Unfiltered. + PNG format. + + raw_peak_dnst_plot_png: + type: File? + outputBinding: + glob: "*_raw_peak_dnst.png" + doc: | + Distribution of peaks per cell. + Unfiltered. + PNG format. + + raw_blck_dnst_plot_png: + type: File? + outputBinding: + glob: "*_raw_blck_dnst.png" + doc: | + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Unfiltered. + PNG format. + + raw_tss_frgm_plot_png: + type: File? + outputBinding: + glob: "*_raw_tss_frgm.png" + doc: | + TSS enrichment score vs ATAC + fragments in peaks per cell. + Unfiltered. + PNG format. + + raw_qc_mtrcs_dnst_plot_png: + type: File? + outputBinding: + glob: "*_raw_qc_mtrcs_dnst.png" + doc: | + Distribution of QC metrics per cell. + Unfiltered. + PNG format. + + raw_atacdbl_plot_png: + type: File? + outputBinding: + glob: "*_raw_atacdbl.png" + doc: | + Percentage of ATAC doublets. + Unfiltered. + PNG format. + + raw_tss_nrch_plot_png: + type: File? + outputBinding: + glob: "*_raw_tss_nrch.png" + doc: | + Signal enrichment around TSS. + Unfiltered; split by the minimum + TSS enrichment score threshold. + PNG format. + + raw_frgm_hist_png: + type: File? + outputBinding: + glob: "*_raw_frgm_hist.png" + doc: | + Histogram of ATAC fragment length. + Unfiltered; split by the maximum + nucleosome signal threshold. + PNG format. + + raw_frgm_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_raw_frgm_dnst_spl_cnd.png" + doc: | + Distribution of ATAC fragments in peaks + per cell. + Unfiltered; split by grouping condition. + PNG format. 
+ + raw_peak_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_raw_peak_dnst_spl_cnd.png" + doc: | + Distribution of peaks per cell. + Unfiltered; split by grouping condition. + PNG format. + + raw_blck_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_raw_blck_dnst_spl_cnd.png" + doc: | + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Unfiltered; split by grouping condition. + PNG format. + + mid_fltr_1_2_qc_mtrcs_pca_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_1_2_qc_mtrcs_pca.png" + doc: | + QC metrics PCA. + Unfiltered, after MACS2 peak calling; + PC1/PC2. + PNG format. + + mid_fltr_2_3_qc_mtrcs_pca_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_2_3_qc_mtrcs_pca.png" + doc: | + QC metrics PCA. + Unfiltered, after MACS2 peak calling; + PC2/PC3. + PNG format. + + mid_fltr_cells_count_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_cells_count.png" + doc: | + Number of cells per dataset. + Unfiltered, after MACS2 peak calling. + PNG format. + + mid_fltr_frgm_dnst_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_frgm_dnst.png" + doc: | + Distribution of ATAC fragments in peaks + per cell. + Unfiltered, after MACS2 peak calling. + PNG format. + + mid_fltr_peak_dnst_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_peak_dnst.png" + doc: | + Distribution of peaks per cell. + Unfiltered, after MACS2 peak calling. + PNG format. + + mid_fltr_blck_dnst_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_blck_dnst.png" + doc: | + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Unfiltered, after MACS2 peak calling. + PNG format. + + mid_fltr_tss_frgm_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_tss_frgm.png" + doc: | + TSS enrichment score vs ATAC + fragments in peaks per cell. + Unfiltered, after MACS2 peak calling. + PNG format. + + mid_fltr_qc_mtrcs_dnst_plot_png: + type: File? 
+ outputBinding: + glob: "*_mid_fltr_qc_mtrcs_dnst.png" + doc: | + Distribution of QC metrics per cell. + Unfiltered, after MACS2 peak calling. + PNG format. + + mid_fltr_atacdbl_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_atacdbl.png" + doc: | + Percentage of ATAC doublets. + Unfiltered, after MACS2 peak calling. + PNG format. + + mid_fltr_tss_nrch_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_tss_nrch.png" + doc: | + Signal enrichment around TSS. + Unfiltered, after MACS2 peak calling; + split by the minimum TSS enrichment + score threshold. + PNG format. + + mid_fltr_frgm_hist_png: + type: File? + outputBinding: + glob: "*_mid_fltr_frgm_hist.png" + doc: | + Histogram of ATAC fragment length. + Unfiltered, after MACS2 peak calling; + split by the maximum nucleosome signal + threshold. + PNG format. + + mid_fltr_frgm_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_frgm_dnst_spl_cnd.png" + doc: | + Distribution of ATAC fragments in peaks + per cell. + Unfiltered, after MACS2 peak calling; + split by grouping condition. + PNG format. + + mid_fltr_peak_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_peak_dnst_spl_cnd.png" + doc: | + Distribution of peaks per cell. + Unfiltered, after MACS2 peak calling; + split by grouping condition. + PNG format. + + mid_fltr_blck_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_mid_fltr_blck_dnst_spl_cnd.png" + doc: | + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Unfiltered, after MACS2 peak calling; + split by grouping condition. + PNG format. + + fltr_1_2_qc_mtrcs_pca_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_1_2_qc_mtrcs_pca.png" + doc: | + QC metrics PCA. + Filtered; PC1/PC2. + PNG format. + + fltr_2_3_qc_mtrcs_pca_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_2_3_qc_mtrcs_pca.png" + doc: | + QC metrics PCA. + Filtered; PC2/PC3. + PNG format. 
+ + fltr_cells_count_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_cells_count.png" + doc: | + Number of cells per dataset. + Filtered. + PNG format. + + fltr_frgm_dnst_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_frgm_dnst.png" + doc: | + Distribution of ATAC fragments in peaks + per cell. + Filtered. + PNG format. + + fltr_peak_dnst_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_peak_dnst.png" + doc: | + Distribution of peaks per cell. + Filtered. + PNG format. + + fltr_blck_dnst_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_blck_dnst.png" + doc: | + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Filtered. + PNG format. + + fltr_tss_frgm_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_tss_frgm.png" + doc: | + TSS enrichment score vs ATAC + fragments in peaks per cell. + Filtered. + PNG format. + + fltr_qc_mtrcs_dnst_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_qc_mtrcs_dnst.png" + doc: | + Distribution of QC metrics per cell. + Filtered. + PNG format. + + fltr_atacdbl_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_atacdbl.png" + doc: | + Percentage of ATAC doublets. + Filtered. + PNG format. + + fltr_tss_nrch_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_tss_nrch.png" + doc: | + Signal enrichment around TSS. + Filtered; split by the minimum + TSS enrichment score threshold. + PNG format. + + fltr_frgm_hist_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_frgm_hist.png" + doc: | + Histogram of ATAC fragment length. + Filtered; split by the maximum + nucleosome signal threshold. + PNG format. + + fltr_frgm_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_frgm_dnst_spl_cnd.png" + doc: | + Distribution of ATAC fragments in peaks + per cell. + Filtered; split by grouping condition. + PNG format. + + fltr_peak_dnst_spl_cnd_plot_png: + type: File? 
+ outputBinding: + glob: "*[!_mid]_fltr_peak_dnst_spl_cnd.png" + doc: | + Distribution of peaks per cell. + Filtered; split by grouping condition. + PNG format. + + fltr_blck_dnst_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*[!_mid]_fltr_blck_dnst_spl_cnd.png" + doc: | + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Filtered; split by grouping condition. + PNG format. + + all_plots_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*.pdf" + doc: | + All generated plots. + PDF format. + + ucsc_cb_config_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser" + doc: | + UCSC Cell Browser configuration data. + + ucsc_cb_html_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser/html_data" + doc: | + UCSC Cell Browser html data. + + ucsc_cb_html_file: + type: File? + outputBinding: + glob: "*_cellbrowser/html_data/index.html" + doc: | + UCSC Cell Browser html index. + + seurat_data_rds: + type: File + outputBinding: + glob: "*_data.rds" + doc: | + Seurat object. + RDS format + + datasets_metadata: + type: File + outputBinding: + glob: "*_meta.tsv" + doc: | + Example of datasets metadata file + in TSV format + + seurat_data_h5seurat: + type: File? + outputBinding: + glob: "*_data.h5seurat" + doc: | + Seurat object. + h5Seurat format + + seurat_data_h5ad: + type: File? + outputBinding: + glob: "*_counts.h5ad" + doc: | + Raw counts from the ATAC assay.
+ H5AD format + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["sc_atac_filter.R"] +arguments: +- valueFrom: | + ${ + if (inputs.aggregation_metadata) { + return inputs.aggregation_metadata; + } else { + return runtime.outdir + "/dummy_metadata.csv" + } + } + prefix: "--identity" + + +stdout: sc_atac_filter_stdout.log +stderr: sc_atac_filter_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-Cell ATAC-Seq Filtering Analysis" +s:name: "Single-Cell ATAC-Seq Filtering Analysis" +s:alternateName: "Single-Cell ATAC-Seq Filtering Analysis" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-atac-filter.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-Cell ATAC-Seq Filtering Analysis + + Removes low-quality cells from the outputs of either the + “Cell Ranger Count (ATAC)” or “Cell Ranger Aggregate (ATAC)” + pipeline. 
The results of this workflow are used in the + “Single-Cell ATAC-Seq Dimensionality Reduction Analysis” + pipeline. + + +s:about: | + usage: sc_atac_filter.R [-h] --mex MEX --identity IDENTITY + --fragments FRAGMENTS --annotations + ANNOTATIONS --seqinfo SEQINFO + [--grouping GROUPING] + [--blacklist BLACKLIST] + [--barcodes BARCODES] + [--atacmincells ATACMINCELLS] + [--minfragments [MINFRAGMENTS [MINFRAGMENTS ...]]] + [--maxnuclsignal [MAXNUCLSIGNAL [MAXNUCLSIGNAL ...]]] + [--mintssenrich [MINTSSENRICH [MINTSSENRICH ...]]] + [--minfrip MINFRIP] + [--maxblacklist [MAXBLACKLIST [MAXBLACKLIST ...]]] + [--callby CALLBY] [--qvalue QVALUE] + [--removedoublets] [--atacdbr ATACDBR] + [--atacdbrsd ATACDBRSD] [--pdf] + [--verbose] [--h5seurat] [--h5ad] + [--cbbuild] [--tmpdir TMPDIR] + [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + [--seed SEED] + + Single-Cell ATAC-Seq Filtering Analysis + + optional arguments: + -h, --help show this help message and exit + --mex MEX Path to the folder with feature-barcode matrix from + Cell Ranger Count (ATAC), Cell Ranger Count + (RNA+ATAC), Cell Ranger Aggregate (ATAC), or Cell + Ranger Aggregate (RNA+ATAC) experiment in MEX format. + For RNA+ATAC experiments the rows corresponding to genes + will be ignored. + --identity IDENTITY Path to the metadata TSV/CSV file to set the datasets + identities. If --mex points to the Cell Ranger + Aggregate (ATAC) or Cell Ranger Aggregate (RNA+ATAC) + outputs, the aggr.csv file can be used. If Cell Ranger + Count (ATAC) or Cell Ranger Count (RNA+ATAC) outputs + have been used in the --mex input, the file should + include at least one column - library_id and one row + with the alias for that experiment. + --fragments FRAGMENTS + Count and barcode information for every ATAC fragment + observed in the experiment in TSV format. Tbi-index + file is required.
+ --annotations ANNOTATIONS + Path to the genome annotation file in GTF format + --seqinfo SEQINFO Path to the headerless chromosome length file in TSV + format + --grouping GROUPING Path to the TSV/CSV file to define datasets grouping. + First column - 'library_id' with the values and order + that correspond to the 'library_id' column from the ' + --identity' file, second column 'condition'. Default: + each dataset is assigned to its own group. + --blacklist BLACKLIST + Path to the optional BED file with the genomic + blacklist regions. + --barcodes BARCODES Path to the TSV/CSV file to optionally prefilter and + extend Seurat object metadata by selected barcodes. + First column should be named as 'barcode'. If file + includes any other columns they will be added to the + Seurat object metadata overwriting the existing ones if + those are present. Default: all cells used, no extra + metadata is added + --atacmincells ATACMINCELLS + Include only peaks detected in at least this many + cells. Default: 5 (applied to all datasets) + --minfragments [MINFRAGMENTS [MINFRAGMENTS ...]] + Include cells where at least this many ATAC fragments + in peaks are detected. If multiple values provided, + each of them will be applied to the correspondent + dataset from the '--mex' input based on the '-- + identity' file. Any 0 will be replaced with the auto- + estimated threshold (median - 2.5 * MAD) calculated + per dataset. Default: 1000 (applied to all datasets) + --maxnuclsignal [MAXNUCLSIGNAL [MAXNUCLSIGNAL ...]] + Include cells with the nucleosome signal not bigger + than this value. Nucleosome signal quantifies the + approximate ratio of mononucleosomal to nucleosome- + free ATAC fragments. If multiple values provided, each + of them will be applied to the correspondent dataset + from the '--mex' input based on the '--identity' file.
+ Default: 4 (applied to all datasets) + --mintssenrich [MINTSSENRICH [MINTSSENRICH ...]] + Include cells with the TSS enrichment score not lower + than this value. Score is calculated based on the + ratio of ATAC fragments centered at the TSS to ATAC + fragments in TSS-flanking regions. If multiple values + provided, each of them will be applied to the + correspondent dataset from the '--mex' input based on + the '--identity' file. Default: 2 (applied to all + datasets) + --minfrip MINFRIP Include cells with the FRiP not lower than this value. + FRiP is calculated for ATAC fragments. Default: 0.15 + (applied to all datasets) + --maxblacklist [MAXBLACKLIST [MAXBLACKLIST ...]] + Include cells with the fraction of ATAC fragments in + genomic blacklist regions not bigger than this value. + If multiple values provided, each of them will be + applied to the correspondent dataset from the '--mex' + input based on the '--identity' file. Default: 0.05 + (applied to all datasets) + --callby CALLBY Replace Cell Ranger peaks with MACS2 peaks called for + cells grouped by the column from the optionally + provided --barcodes file. If --barcodes file was not + provided MACS2 peaks can be still called per dataset + by setting --callby to new.ident. Peaks are called + only after applying maximum nucleosome signal and + minimum TSS enrichment scores filters. Default: do not + call peaks + --qvalue QVALUE Minimum FDR (q-value) cutoff for MACS2 peak detection. + Ignored if --callby is not provided. Default: 0.05 + --removedoublets Remove cells that were identified as doublets. + Default: do not remove doublets + --atacdbr ATACDBR Expected ATAC doublet rate. Default: 1 percent per + thousand cells captured with 10x genomics + --atacdbrsd ATACDBRSD + Uncertainty range in the ATAC doublet rate, + interpreted as a +/- around the value provided in + --atacdbr. Set to 0 to disable. Set to 1 to make the + threshold depend entirely on the misclassification + rate. 
Default: 40 percents of the value provided in + --atacdbr + --pdf Export plots in PDF. Default: false + --verbose Print debug information. Default: false + --h5seurat Save Seurat data to h5seurat file. Default: false + --h5ad Save raw counts from the ATAC assay to h5ad file. + Default: false + --cbbuild Export results to UCSC Cell Browser. Default: false + --tmpdir TMPDIR Directory to keep temporary files. Default: either + /tmp or defined by environment variables TMPDIR, TMP, + TEMP. + --output OUTPUT Output prefix. Default: ./sc + --theme {gray,bw,linedraw,light,dark,minimal,classic,void} + Color theme for all generated plots. Default: classic + --cpus CPUS Number of cores/cpus to use. Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple '--cpus'. Default: 32 + --seed SEED Seed number for random values. Default: 42 \ No newline at end of file diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index 8a5f3f25..9401da93 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.34 + dockerPull: biowardrobe2/sc-tools:v0.0.35 inputs: diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index f1778d6c..321a49df 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.34 + dockerPull: biowardrobe2/sc-tools:v0.0.35 inputs: diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index d8d37e00..add80d42 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.34 + dockerPull: biowardrobe2/sc-tools:v0.0.35 inputs: @@ -254,16 +254,12 @@ inputs: Default: 2 (applied to all datasets) minimum_frip: - type: - - "null" 
- - float - - float[] + type: float? inputBinding: prefix: "--minfrip" doc: | - Include cells with the FRiP not lower than this value. If multiple values - provided, each of them will be applied to the correspondent dataset from the - '--mex' input based on the '--identity' file. FRiP is calculated for ATAC fragments. + Include cells with the FRiP not lower than this + value. FRiP is calculated for ATAC fragments. Default: 0.15 (applied to all datasets) maximum_blacklist_fraction: @@ -465,1308 +461,784 @@ outputs: outputBinding: glob: "*_raw_1_2_qc_mtrcs_pca.png" doc: | - PC1 and PC2 from the QC metrics PCA (not filtered). - PNG format - - raw_1_2_qc_mtrcs_pca_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_1_2_qc_mtrcs_pca.pdf" - doc: | - PC1 and PC2 from the QC metrics PCA (not filtered). - PDF format + QC metrics PCA. + Unfiltered; PC1/PC2. + PNG format. raw_2_3_qc_mtrcs_pca_plot_png: type: File? outputBinding: glob: "*_raw_2_3_qc_mtrcs_pca.png" doc: | - PC2 and PC3 from the QC metrics PCA (not filtered). - PNG format - - raw_2_3_qc_mtrcs_pca_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_2_3_qc_mtrcs_pca.pdf" - doc: | - PC2 and PC3 from the QC metrics PCA (not filtered). - PDF format + QC metrics PCA. + Unfiltered; PC2/PC3. + PNG format. raw_cells_count_plot_png: type: File? outputBinding: glob: "*_raw_cells_count.png" doc: | - Number of cells per dataset (not filtered). - PNG format - - raw_cells_count_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_cells_count.pdf" - doc: | - Number of cells per dataset (not filtered). - PDF format + Number of cells per dataset. + Unfiltered. + PNG format. raw_umi_dnst_plot_png: type: File? outputBinding: glob: "*_raw_umi_dnst.png" doc: | - RNA reads per cell density (not filtered). - PNG format - - raw_umi_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_umi_dnst.pdf" - doc: | - RNA reads per cell density (not filtered). - PDF format + Distribution of RNA reads per cell. + Unfiltered. 
+ PNG format. raw_gene_dnst_plot_png: type: File? outputBinding: glob: "*_raw_gene_dnst.png" doc: | - Genes per cell density (not filtered). - PNG format - - raw_gene_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_gene_dnst.pdf" - doc: | - Genes per cell density (not filtered). - PDF format + Distribution of genes per cell. + Unfiltered. + PNG format. raw_gene_umi_plot_png: type: File? outputBinding: glob: "*_raw_gene_umi.png" doc: | - Genes vs RNA reads per cell (not filtered). - PNG format - - raw_gene_umi_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_gene_umi.pdf" - doc: | - Genes vs RNA reads per cell (not filtered). - PDF format + Genes vs RNA reads per cell. + Unfiltered. + PNG format. raw_umi_mito_plot_png: type: File? outputBinding: glob: "*_raw_umi_mito.png" doc: | - RNA reads vs mitochondrial % per cell (not filtered). - PNG format - - raw_umi_mito_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_umi_mito.pdf" - doc: | - RNA reads vs mitochondrial % per cell (not filtered). - PDF format + RNA reads vs mitochondrial percentage + per cell. + Unfiltered. + PNG format. raw_mito_dnst_plot_png: type: File? outputBinding: glob: "*_raw_mito_dnst.png" doc: | - Percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered). - PNG format - - raw_mito_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_mito_dnst.pdf" - doc: | - Percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered). - PDF format + Distribution of RNA reads mapped + to mitochondrial genes per cell. + Unfiltered. + PNG format. raw_nvlt_dnst_plot_png: type: File? outputBinding: glob: "*_raw_nvlt_dnst.png" doc: | - Novelty score per cell density for RNA assay (not filtered). - PNG format - - raw_nvlt_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_nvlt_dnst.pdf" - doc: | - Novelty score per cell density for RNA assay (not filtered). - PDF format + Distribution of novelty score per cell. 
+ Unfiltered. + PNG format. raw_frgm_dnst_plot_png: type: File? outputBinding: glob: "*_raw_frgm_dnst.png" doc: | - ATAC fragments in peaks per cell density (not filtered). - PNG format - - raw_frgm_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_frgm_dnst.pdf" - doc: | - ATAC fragments in peaks per cell density (not filtered). - PDF format + Distribution of ATAC fragments in peaks + per cell. + Unfiltered. + PNG format. raw_peak_dnst_plot_png: type: File? outputBinding: glob: "*_raw_peak_dnst.png" doc: | - Peaks per cell density (not filtered). - PNG format - - raw_peak_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_peak_dnst.pdf" - doc: | - Peaks per cell density (not filtered). - PDF format + Distribution of peaks per cell. + Unfiltered. + PNG format. raw_blck_dnst_plot_png: type: File? outputBinding: glob: "*_raw_blck_dnst.png" doc: | - Fraction of ATAC fragments within genomic blacklist regions per cell density (not filtered). - PNG format - - raw_blck_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_blck_dnst.pdf" - doc: | - Fraction of ATAC fragments within genomic blacklist regions per cell density (not filtered). - PDF format + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Unfiltered. + PNG format. raw_rna_atac_cnts_plot_png: type: File? outputBinding: glob: "*_raw_rna_atac_cnts.png" doc: | - RNA reads vs ATAC fragments in peaks per cell (not filtered). - PNG format - - raw_rna_atac_cnts_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_rna_atac_cnts.pdf" - doc: | - RNA reads vs ATAC fragments in peaks per cell (not filtered). - PDF format + RNA reads vs ATAC fragments + in peaks per cell. + Unfiltered. + PNG format. raw_tss_frgm_plot_png: type: File? outputBinding: glob: "*_raw_tss_frgm.png" doc: | - TSS enrichment score vs ATAC fragments in peaks per cell (not filtered). - PNG format - - raw_tss_frgm_plot_pdf: - type: File? 
- outputBinding: - glob: "*_raw_tss_frgm.pdf" - doc: | - TSS enrichment score vs ATAC fragments in peaks per cell (not filtered). - PDF format + TSS enrichment score vs ATAC + fragments in peaks per cell. + Unfiltered. + PNG format. raw_qc_mtrcs_dnst_plot_png: type: File? outputBinding: glob: "*_raw_qc_mtrcs_dnst.png" doc: | - QC metrics per cell density (not filtered). - PNG format - - raw_qc_mtrcs_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_qc_mtrcs_dnst.pdf" - doc: | - QC metrics per cell density (not filtered). - PDF format + Distribution of QC metrics per cell. + Unfiltered. + PNG format. raw_rnadbl_plot_png: type: File? outputBinding: glob: "*_raw_rnadbl.png" doc: | - Percentage of RNA doublets per dataset (not filtered). - PNG format - - raw_rnadbl_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_rnadbl.pdf" - doc: | - Percentage of RNA doublets per dataset (not filtered). - PDF format + Percentage of RNA doublets. + Unfiltered. + PNG format. raw_atacdbl_plot_png: type: File? outputBinding: glob: "*_raw_atacdbl.png" doc: | - Percentage of ATAC doublets per dataset (not filtered). - PNG format - - raw_atacdbl_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_atacdbl.pdf" - doc: | - Percentage of ATAC doublets per dataset (not filtered). - PDF format + Percentage of ATAC doublets. + Unfiltered. + PNG format. raw_vrlpdbl_plot_png: type: File? outputBinding: glob: "*_raw_vrlpdbl.png" doc: | - Doublets overlap for RNA and ATAC assays per dataset (not filtered). - PNG format - - raw_vrlpdbl_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_vrlpdbl.pdf" - doc: | - Doublets overlap for RNA and ATAC assays per dataset (not filtered). - PDF format + Percentage of RNA and ATAC doublets. + Unfiltered. + PNG format. raw_tss_nrch_plot_png: type: File? outputBinding: glob: "*_raw_tss_nrch.png" doc: | - TSS enrichment score (not filtered). - PNG format - - raw_tss_nrch_plot_pdf: - type: File? 
- outputBinding: - glob: "*_raw_tss_nrch.pdf" - doc: | - TSS enrichment score (not filtered). - PDF format + Signal enrichment around TSS. + Unfiltered; split by the minimum + TSS enrichment score threshold. + PNG format. raw_frgm_hist_png: type: File? outputBinding: glob: "*_raw_frgm_hist.png" doc: | - ATAC fragments length histogram (not filtered). - PNG format - - raw_frgm_hist_pdf: - type: File? - outputBinding: - glob: "*_raw_frgm_hist.pdf" - doc: | - ATAC fragments length histogram (not filtered). - PDF format + Histogram of ATAC fragment length. + Unfiltered; split by the maximum + nucleosome signal threshold. + PNG format. raw_umi_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_raw_umi_dnst_spl_cnd.png" doc: | - Split by grouping condition RNA reads per cell density (not filtered). - PNG format - - raw_umi_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_umi_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition RNA reads per cell density (not filtered). - PDF format + Distribution of RNA reads per cell. + Unfiltered; split by grouping condition. + PNG format. raw_gene_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_raw_gene_dnst_spl_cnd.png" doc: | - Split by grouping condition genes per cell density (not filtered). - PNG format - - raw_gene_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_gene_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition genes per cell density (not filtered). - PDF format + Distribution of genes per cell. + Unfiltered; split by grouping condition. + PNG format. raw_mito_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_raw_mito_dnst_spl_cnd.png" doc: | - Split by grouping condition the percentage of RNA reads mapped - to mitochondrial genes per cell density (not filtered). - PNG format - - raw_mito_dnst_spl_cnd_plot_pdf: - type: File? 
- outputBinding: - glob: "*_raw_mito_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition the percentage of RNA reads mapped - to mitochondrial genes per cell density (not filtered). - PDF format + Distribution of RNA reads mapped + to mitochondrial genes per cell. + Unfiltered; split by grouping condition. + PNG format. raw_nvlt_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_raw_nvlt_dnst_spl_cnd.png" doc: | - Split by grouping condition the novelty score per cell density for RNA assay (not filtered). - PNG format - - raw_nvlt_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_nvlt_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition the novelty score per cell density for RNA assay (not filtered). - PDF format + Distribution of novelty score per cell. + Unfiltered; split by grouping condition. + PNG format. raw_frgm_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_raw_frgm_dnst_spl_cnd.png" doc: | - Split by grouping condition ATAC fragments in peaks per cell density (not filtered). - PNG format - - raw_frgm_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_frgm_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition ATAC fragments in peaks per cell density (not filtered). - PDF format + Distribution of ATAC fragments in peaks + per cell. + Unfiltered; split by grouping condition. + PNG format. raw_peak_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_raw_peak_dnst_spl_cnd.png" doc: | - Split by grouping condition peaks per cell density (not filtered). - PNG format - - raw_peak_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_peak_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition peaks per cell density (not filtered). - PDF format + Distribution of peaks per cell. + Unfiltered; split by grouping condition. + PNG format. raw_blck_dnst_spl_cnd_plot_png: type: File? 
outputBinding: glob: "*_raw_blck_dnst_spl_cnd.png" doc: | - Split by grouping condition the fraction of ATAC fragments within genomic - blacklist regions per cell density (not filtered). - PNG format - - raw_blck_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_blck_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition the fraction of ATAC fragments within genomic - blacklist regions per cell density (not filtered). - PDF format + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Unfiltered; split by grouping condition. + PNG format. mid_fltr_1_2_qc_mtrcs_pca_plot_png: type: File? outputBinding: glob: "*_mid_fltr_1_2_qc_mtrcs_pca.png" doc: | - PC1 and PC2 from the QC metrics PCA (intermediate filtered). - PNG format - - mid_fltr_1_2_qc_mtrcs_pca_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_1_2_qc_mtrcs_pca.pdf" - doc: | - PC1 and PC2 from the QC metrics PCA (intermediate filtered). - PDF format + QC metrics PCA. + Unfiltered, after MACS2 peak calling; + PC1/PC2. + PNG format. mid_fltr_2_3_qc_mtrcs_pca_plot_png: type: File? outputBinding: glob: "*_mid_fltr_2_3_qc_mtrcs_pca.png" doc: | - PC2 and PC3 from the QC metrics PCA (intermediate filtered). - PNG format - - mid_fltr_2_3_qc_mtrcs_pca_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_2_3_qc_mtrcs_pca.pdf" - doc: | - PC2 and PC3 from the QC metrics PCA (intermediate filtered). - PDF format + QC metrics PCA. + Unfiltered, after MACS2 peak calling; + PC2/PC3. + PNG format. mid_fltr_cells_count_plot_png: type: File? outputBinding: glob: "*_mid_fltr_cells_count.png" doc: | - Number of cells per dataset (intermediate filtered). - PNG format - - mid_fltr_cells_count_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_cells_count.pdf" - doc: | - Number of cells per dataset (intermediate filtered). - PDF format + Number of cells per dataset. + Unfiltered, after MACS2 peak calling. + PNG format. mid_fltr_umi_dnst_plot_png: type: File? 
outputBinding: glob: "*_mid_fltr_umi_dnst.png" doc: | - RNA reads per cell density (intermediate filtered). - PNG format - - mid_fltr_umi_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_umi_dnst.pdf" - doc: | - RNA reads per cell density (intermediate filtered). - PDF format + Distribution of RNA reads per cell. + Unfiltered, after MACS2 peak calling. + PNG format. mid_fltr_gene_dnst_plot_png: type: File? outputBinding: glob: "*_mid_fltr_gene_dnst.png" doc: | - Genes per cell density (intermediate filtered). - PNG format - - mid_fltr_gene_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_gene_dnst.pdf" - doc: | - Genes per cell density (intermediate filtered). - PDF format + Distribution of genes per cell. + Unfiltered, after MACS2 peak calling. + PNG format. mid_fltr_gene_umi_plot_png: type: File? outputBinding: glob: "*_mid_fltr_gene_umi.png" doc: | - Genes vs RNA reads per cell (intermediate filtered). - PNG format - - mid_fltr_gene_umi_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_gene_umi.pdf" - doc: | - Genes vs RNA reads per cell (intermediate filtered). - PDF format - - mid_umi_mito_plot_png: - type: File? - outputBinding: - glob: "*_mid_umi_mito.png" - doc: | - RNA reads vs mitochondrial % per cell (intermediate filtered). - PNG format + Genes vs RNA reads per cell. + Unfiltered, after MACS2 peak calling. + PNG format. - mid_umi_mito_plot_pdf: + mid_fltr_umi_mito_plot_png: type: File? outputBinding: - glob: "*_mid_umi_mito.pdf" + glob: "*_mid_fltr_umi_mito.png" doc: | - RNA reads vs mitochondrial % per cell (intermediate filtered). - PDF format + RNA reads vs mitochondrial percentage + per cell. + Unfiltered, after MACS2 peak calling. + PNG format. mid_fltr_mito_dnst_plot_png: type: File? outputBinding: glob: "*_mid_fltr_mito_dnst.png" doc: | - Percentage of RNA reads mapped to mitochondrial genes per cell density (intermediate filtered). - PNG format - - mid_fltr_mito_dnst_plot_pdf: - type: File? 
- outputBinding: - glob: "*_mid_fltr_mito_dnst.pdf" - doc: | - Percentage of RNA reads mapped to mitochondrial genes per cell density (intermediate filtered). - PDF format + Distribution of RNA reads mapped + to mitochondrial genes per cell. + Unfiltered, after MACS2 peak calling. + PNG format. mid_fltr_nvlt_dnst_plot_png: type: File? outputBinding: glob: "*_mid_fltr_nvlt_dnst.png" doc: | - Novelty score per cell density for RNA assay (intermediate filtered). - PNG format - - mid_fltr_nvlt_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_nvlt_dnst.pdf" - doc: | - Novelty score per cell density for RNA assay (intermediate filtered). - PDF format + Distribution of novelty score per cell. + Unfiltered, after MACS2 peak calling. + PNG format. mid_fltr_frgm_dnst_plot_png: type: File? outputBinding: glob: "*_mid_fltr_frgm_dnst.png" doc: | - ATAC fragments in peaks per cell density (intermediate filtered). - PNG format - - mid_fltr_frgm_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_frgm_dnst.pdf" - doc: | - ATAC fragments in peaks per cell density (intermediate filtered). - PDF format + Distribution of ATAC fragments in peaks + per cell. + Unfiltered, after MACS2 peak calling. + PNG format. mid_fltr_peak_dnst_plot_png: type: File? outputBinding: glob: "*_mid_fltr_peak_dnst.png" doc: | - Peaks per cell density (intermediate filtered). - PNG format - - mid_fltr_peak_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_peak_dnst.pdf" - doc: | - Peaks per cell density (intermediate filtered). - PDF format + Distribution of peaks per cell. + Unfiltered, after MACS2 peak calling. + PNG format. mid_fltr_blck_dnst_plot_png: type: File? outputBinding: glob: "*_mid_fltr_blck_dnst.png" doc: | - Fraction of ATAC fragments within genomic blacklist regions per cell density (intermediate filtered). - PNG format - - mid_fltr_blck_dnst_plot_pdf: - type: File? 
- outputBinding: - glob: "*_mid_fltr_blck_dnst.pdf" - doc: | - Fraction of ATAC fragments within genomic blacklist regions per cell density (intermediate filtered). - PDF format + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Unfiltered, after MACS2 peak calling. + PNG format. mid_fltr_rna_atac_cnts_plot_png: type: File? outputBinding: glob: "*_mid_fltr_rna_atac_cnts.png" doc: | - RNA reads vs ATAC fragments in peaks per cell (intermediate filtered). - PNG format - - mid_fltr_rna_atac_cnts_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_rna_atac_cnts.pdf" - doc: | - RNA reads vs ATAC fragments in peaks per cell (intermediate filtered). - PDF format + RNA reads vs ATAC fragments + in peaks per cell. + Unfiltered, after MACS2 peak calling. + PNG format. mid_fltr_tss_frgm_plot_png: type: File? outputBinding: glob: "*_mid_fltr_tss_frgm.png" doc: | - TSS enrichment score vs ATAC fragments in peaks per cell (intermediate filtered). - PNG format - - mid_fltr_tss_frgm_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_tss_frgm.pdf" - doc: | - TSS enrichment score vs ATAC fragments in peaks per cell (intermediate filtered). - PDF format + TSS enrichment score vs ATAC + fragments in peaks per cell. + Unfiltered, after MACS2 peak calling. + PNG format. mid_fltr_qc_mtrcs_dnst_plot_png: type: File? outputBinding: glob: "*_mid_fltr_qc_mtrcs_dnst.png" doc: | - QC metrics per cell density (intermediate filtered). - PNG format - - mid_fltr_qc_mtrcs_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_qc_mtrcs_dnst.pdf" - doc: | - QC metrics per cell density (intermediate filtered). - PDF format + Distribution of QC metrics per cell. + Unfiltered, after MACS2 peak calling. + PNG format. mid_fltr_rnadbl_plot_png: type: File? outputBinding: glob: "*_mid_fltr_rnadbl.png" doc: | - Percentage of RNA doublets per dataset (intermediate filtered). - PNG format - - mid_fltr_rnadbl_plot_pdf: - type: File? 
- outputBinding: - glob: "*_mid_fltr_rnadbl.pdf" - doc: | - Percentage of RNA doublets per dataset (intermediate filtered). - PDF format + Percentage of RNA doublets. + Unfiltered, after MACS2 peak calling. + PNG format. mid_fltr_atacdbl_plot_png: type: File? outputBinding: glob: "*_mid_fltr_atacdbl.png" doc: | - Percentage of ATAC doublets per dataset (intermediate filtered). - PNG format - - mid_fltr_atacdbl_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_atacdbl.pdf" - doc: | - Percentage of ATAC doublets per dataset (intermediate filtered). - PDF format + Percentage of ATAC doublets. + Unfiltered, after MACS2 peak calling. + PNG format. mid_fltr_vrlpdbl_plot_png: type: File? outputBinding: glob: "*_mid_fltr_vrlpdbl.png" doc: | - Doublets overlap for RNA and ATAC assays per dataset (intermediate filtered). - PNG format - - mid_fltr_vrlpdbl_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_vrlpdbl.pdf" - doc: | - Doublets overlap for RNA and ATAC assays per dataset (intermediate filtered). - PDF format + Percentage of RNA and ATAC doublets. + Unfiltered, after MACS2 peak calling. + PNG format. mid_fltr_tss_nrch_plot_png: type: File? outputBinding: glob: "*_mid_fltr_tss_nrch.png" doc: | - TSS enrichment score (intermediate filtered). - PNG format - - mid_fltr_tss_nrch_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_tss_nrch.pdf" - doc: | - TSS enrichment score (intermediate filtered). - PDF format + Signal enrichment around TSS. + Unfiltered, after MACS2 peak calling; + split by the minimum TSS enrichment + score threshold. + PNG format. mid_fltr_frgm_hist_png: type: File? outputBinding: glob: "*_mid_fltr_frgm_hist.png" doc: | - ATAC fragments length histogram (intermediate filtered). - PNG format - - mid_fltr_frgm_hist_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_frgm_hist.pdf" - doc: | - ATAC fragments length histogram (intermediate filtered). - PDF format + Histogram of ATAC fragment length. 
+ Unfiltered, after MACS2 peak calling; + split by the maximum nucleosome signal + threshold. + PNG format. mid_fltr_umi_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_mid_fltr_umi_dnst_spl_cnd.png" doc: | - Split by grouping condition RNA reads per cell density (intermediate filtered). - PNG format - - mid_fltr_umi_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_umi_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition RNA reads per cell density (intermediate filtered). - PDF format + Distribution of RNA reads per cell. + Unfiltered, after MACS2 peak calling; + split by grouping condition. + PNG format. mid_fltr_gene_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_mid_fltr_gene_dnst_spl_cnd.png" doc: | - Split by grouping condition genes per cell density (intermediate filtered). - PNG format - - mid_fltr_gene_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_gene_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition genes per cell density (intermediate filtered). - PDF format + Distribution of genes per cell. + Unfiltered, after MACS2 peak calling; + split by grouping condition. + PNG format. mid_fltr_mito_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_mid_fltr_mito_dnst_spl_cnd.png" doc: | - Split by grouping condition the percentage of RNA reads mapped - to mitochondrial genes per cell density (intermediate filtered). - PNG format - - mid_fltr_mito_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_mito_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition the percentage of RNA reads mapped - to mitochondrial genes per cell density (intermediate filtered). - PDF format + Distribution of RNA reads mapped + to mitochondrial genes per cell. + Unfiltered, after MACS2 peak calling; + split by grouping condition. + PNG format. mid_fltr_nvlt_dnst_spl_cnd_plot_png: type: File? 
outputBinding: glob: "*_mid_fltr_nvlt_dnst_spl_cnd.png" doc: | - Split by grouping condition the novelty score per cell density for RNA assay (intermediate filtered). - PNG format - - mid_fltr_nvlt_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_nvlt_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition the novelty score per cell density for RNA assay (intermediate filtered). - PDF format + Distribution of novelty score per cell. + Unfiltered, after MACS2 peak calling; + split by grouping condition. + PNG format. mid_fltr_frgm_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_mid_fltr_frgm_dnst_spl_cnd.png" doc: | - Split by grouping condition ATAC fragments in peaks per cell density (intermediate filtered). - PNG format - - mid_fltr_frgm_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_frgm_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition ATAC fragments in peaks per cell density (intermediate filtered). - PDF format + Distribution of ATAC fragments in peaks + per cell. + Unfiltered, after MACS2 peak calling; + split by grouping condition. + PNG format. mid_fltr_peak_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_mid_fltr_peak_dnst_spl_cnd.png" doc: | - Split by grouping condition peaks per cell density (intermediate filtered). - PNG format - - mid_fltr_peak_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_mid_fltr_peak_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition peaks per cell density (intermediate filtered). - PDF format + Distribution of peaks per cell. + Unfiltered, after MACS2 peak calling; + split by grouping condition. + PNG format. mid_fltr_blck_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_mid_fltr_blck_dnst_spl_cnd.png" doc: | - Split by grouping condition the fraction of ATAC fragments within genomic - blacklist regions per cell density (intermediate filtered). - PNG format - - mid_fltr_blck_dnst_spl_cnd_plot_pdf: - type: File? 
- outputBinding: - glob: "*_mid_fltr_blck_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition the fraction of ATAC fragments within genomic - blacklist regions per cell density (intermediate filtered). - PDF format + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Unfiltered, after MACS2 peak calling; + split by grouping condition. + PNG format. fltr_1_2_qc_mtrcs_pca_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_1_2_qc_mtrcs_pca.png" doc: | - PC1 and PC2 from the QC metrics PCA (filtered). - PNG format - - fltr_1_2_qc_mtrcs_pca_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_1_2_qc_mtrcs_pca.pdf" - doc: | - PC1 and PC2 from the QC metrics PCA (filtered). - PDF format + QC metrics PCA. + Filtered; PC1/PC2. + PNG format. fltr_2_3_qc_mtrcs_pca_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_2_3_qc_mtrcs_pca.png" doc: | - PC2 and PC3 from the QC metrics PCA (filtered). - PNG format - - fltr_2_3_qc_mtrcs_pca_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_2_3_qc_mtrcs_pca.pdf" - doc: | - PC2 and PC3 from the QC metrics PCA (filtered). - PDF format + QC metrics PCA. + Filtered; PC2/PC3. + PNG format. fltr_cells_count_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_cells_count.png" doc: | - Number of cells per dataset (filtered). - PNG format - - fltr_cells_count_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_cells_count.pdf" - doc: | - Number of cells per dataset (filtered). - PDF format + Number of cells per dataset. + Filtered. + PNG format. fltr_umi_dnst_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_umi_dnst.png" doc: | - RNA reads per cell density (filtered). - PNG format - - fltr_umi_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_umi_dnst.pdf" - doc: | - RNA reads per cell density (filtered). - PDF format + Distribution of RNA reads per cell. + Filtered. + PNG format. fltr_gene_dnst_plot_png: type: File? 
outputBinding: glob: "*[!_mid]_fltr_gene_dnst.png" doc: | - Genes per cell density (filtered). - PNG format - - fltr_gene_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_gene_dnst.pdf" - doc: | - Genes per cell density (filtered). - PDF format + Distribution of genes per cell. + Filtered. + PNG format. fltr_gene_umi_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_gene_umi.png" doc: | - Genes vs RNA reads per cell (filtered). - PNG format - - fltr_gene_umi_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_gene_umi.pdf" - doc: | - Genes vs RNA reads per cell (filtered). - PDF format + Genes vs RNA reads per cell. + Filtered. + PNG format. fltr_umi_mito_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_umi_mito.png" doc: | - RNA reads vs mitochondrial % per cell (filtered). - PNG format - - fltr_umi_mito_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_umi_mito.pdf" - doc: | - RNA reads vs mitochondrial % per cell (filtered). - PDF format + RNA reads vs mitochondrial percentage + per cell. + Filtered. + PNG format. fltr_mito_dnst_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_mito_dnst.png" doc: | - Percentage of RNA reads mapped to mitochondrial genes per cell density (filtered). - PNG format - - fltr_mito_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_mito_dnst.pdf" - doc: | - Percentage of RNA reads mapped to mitochondrial genes per cell density (filtered). - PDF format + Distribution of RNA reads mapped + to mitochondrial genes per cell. + Filtered. + PNG format. fltr_nvlt_dnst_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_nvlt_dnst.png" doc: | - Novelty score per cell density for RNA assay (filtered). - PNG format - - fltr_nvlt_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_nvlt_dnst.pdf" - doc: | - Novelty score per cell density for RNA assay (filtered). - PDF format + Distribution of novelty score per cell. + Filtered. 
+ PNG format. fltr_frgm_dnst_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_frgm_dnst.png" doc: | - ATAC fragments in peaks per cell density (filtered). - PNG format - - fltr_frgm_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_frgm_dnst.pdf" - doc: | - ATAC fragments in peaks per cell density (filtered). - PDF format + Distribution of ATAC fragments in peaks + per cell. + Filtered. + PNG format. fltr_peak_dnst_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_peak_dnst.png" doc: | - Peaks per cell density (filtered). - PNG format - - fltr_peak_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_peak_dnst.pdf" - doc: | - Peaks per cell density (filtered). - PDF format + Distribution of peaks per cell. + Filtered. + PNG format. fltr_blck_dnst_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_blck_dnst.png" doc: | - Fraction of ATAC fragments within genomic blacklist regions per cell density (filtered). - PNG format - - fltr_blck_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_blck_dnst.pdf" - doc: | - Fraction of ATAC fragments within genomic blacklist regions per cell density (filtered). - PDF format + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Filtered. + PNG format. fltr_rna_atac_cnts_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_rna_atac_cnts.png" doc: | - RNA reads vs ATAC fragments in peaks per cell (filtered). - PNG format + RNA reads vs ATAC fragments + in peaks per cell. + Filtered. + PNG format. - fltr_rna_atac_cnts_plot_pdf: + fltr_tss_frgm_plot_png: type: File? outputBinding: - glob: "*[!_mid]_fltr_rna_atac_cnts.pdf" + glob: "*[!_mid]_fltr_tss_frgm.png" doc: | - RNA reads vs ATAC fragments in peaks per cell (filtered). - PDF format + TSS enrichment score vs ATAC + fragments in peaks per cell. + Filtered. + PNG format. - fltr_rnadbl_plot_png: + fltr_qc_mtrcs_dnst_plot_png: type: File? 
outputBinding: - glob: "*[!_mid]_fltr_rnadbl.png" + glob: "*[!_mid]_fltr_qc_mtrcs_dnst.png" doc: | - Percentage of RNA doublets per dataset (filtered). - PNG format + Distribution of QC metrics per cell. + Filtered. + PNG format. - fltr_rnadbl_plot_pdf: + fltr_rnadbl_plot_png: type: File? outputBinding: - glob: "*[!_mid]_fltr_rnadbl.pdf" + glob: "*[!_mid]_fltr_rnadbl.png" doc: | - Percentage of RNA doublets per dataset (filtered). - PDF format + Percentage of RNA doublets. + Filtered. + PNG format. fltr_atacdbl_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_atacdbl.png" doc: | - Percentage of ATAC doublets per dataset (filtered). - PNG format - - fltr_atacdbl_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_atacdbl.pdf" - doc: | - Percentage of ATAC doublets per dataset (filtered). - PDF format + Percentage of ATAC doublets. + Filtered. + PNG format. fltr_vrlpdbl_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_vrlpdbl.png" doc: | - Doublets overlap for RNA and ATAC assays per dataset (filtered). - PNG format - - fltr_vrlpdbl_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_vrlpdbl.pdf" - doc: | - Doublets overlap for RNA and ATAC assays per dataset (filtered). - PDF format - - fltr_tss_frgm_plot_png: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_tss_frgm.png" - doc: | - TSS enrichment score vs ATAC fragments in peaks per cell (filtered). - PNG format - - fltr_tss_frgm_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_tss_frgm.pdf" - doc: | - TSS enrichment score vs ATAC fragments in peaks per cell (filtered). - PDF format - - fltr_qc_mtrcs_dnst_plot_png: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_qc_mtrcs_dnst.png" - doc: | - QC metrics per cell density (filtered). - PNG format - - fltr_qc_mtrcs_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_qc_mtrcs_dnst.pdf" - doc: | - QC metrics per cell density (filtered). - PDF format + Percentage of RNA and ATAC doublets. 
+ Filtered. + PNG format. fltr_tss_nrch_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_tss_nrch.png" doc: | - TSS enrichment score (filtered). - PNG format - - fltr_tss_nrch_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_tss_nrch.pdf" - doc: | - TSS enrichment score (filtered). - PDF format + Signal enrichment around TSS. + Filtered; split by the minimum + TSS enrichment score threshold. + PNG format. fltr_frgm_hist_png: type: File? outputBinding: glob: "*[!_mid]_fltr_frgm_hist.png" doc: | - ATAC fragments length histogram (filtered). - PNG format - - fltr_frgm_hist_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_frgm_hist.pdf" - doc: | - ATAC fragments length histogram (filtered). - PDF format + Histogram of ATAC fragment length. + Filtered; split by the maximum + nucleosome signal threshold. + PNG format. fltr_umi_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_umi_dnst_spl_cnd.png" doc: | - Split by grouping condition RNA reads per cell density (filtered). - PNG format - - fltr_umi_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_umi_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition RNA reads per cell density (filtered). - PDF format + Distribution of RNA reads per cell. + Filtered; split by grouping condition. + PNG format. fltr_gene_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_gene_dnst_spl_cnd.png" doc: | - Split by grouping condition genes per cell density (filtered). - PNG format - - fltr_gene_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_gene_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition genes per cell density (filtered). - PDF format + Distribution of genes per cell. + Filtered; split by grouping condition. + PNG format. fltr_mito_dnst_spl_cnd_plot_png: type: File? 
outputBinding: glob: "*[!_mid]_fltr_mito_dnst_spl_cnd.png" doc: | - Split by grouping condition the percentage of RNA reads mapped - to mitochondrial genes per cell density (filtered). - PNG format - - fltr_mito_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_mito_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition the percentage of RNA reads mapped - to mitochondrial genes per cell density (filtered). - PDF format + Distribution of RNA reads mapped + to mitochondrial genes per cell. + Filtered; split by grouping condition. + PNG format. fltr_nvlt_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_nvlt_dnst_spl_cnd.png" doc: | - Split by grouping condition the novelty score per cell density for RNA assay (filtered). - PNG format - - fltr_nvlt_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_nvlt_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition the novelty score per cell density for RNA assay (filtered). - PDF format + Distribution of novelty score per cell. + Filtered; split by grouping condition. + PNG format. fltr_frgm_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_frgm_dnst_spl_cnd.png" doc: | - Split by grouping condition ATAC fragments in peaks per cell density (filtered). - PNG format - - fltr_frgm_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*[!_mid]_fltr_frgm_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition ATAC fragments in peaks per cell density (filtered). - PDF format + Distribution of ATAC fragments in peaks + per cell. + Filtered; split by grouping condition. + PNG format. fltr_peak_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_peak_dnst_spl_cnd.png" doc: | - Split by grouping condition peaks per cell density (filtered). - PNG format - - fltr_peak_dnst_spl_cnd_plot_pdf: - type: File? 
- outputBinding: - glob: "*[!_mid]_fltr_peak_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition peaks per cell density (filtered). - PDF format + Distribution of peaks per cell. + Filtered; split by grouping condition. + PNG format. fltr_blck_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*[!_mid]_fltr_blck_dnst_spl_cnd.png" doc: | - Split by grouping condition the fraction of ATAC fragments within genomic - blacklist regions per cell density (filtered). - PNG format + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Filtered; split by grouping condition. + PNG format. - fltr_blck_dnst_spl_cnd_plot_pdf: - type: File? + all_plots_pdf: + type: + - "null" + - type: array + items: File outputBinding: - glob: "*[!_mid]_fltr_blck_dnst_spl_cnd.pdf" + glob: "*.pdf" doc: | - Split by grouping condition the fraction of ATAC fragments within genomic - blacklist regions per cell density (filtered). - PDF format + All generated plots. + PDF format. ucsc_cb_config_data: type: Directory? @@ -1864,7 +1336,7 @@ $schemas: label: "Single-Cell Multiome ATAC-Seq and RNA-Seq Filtering Analysis" s:name: "Single-Cell Multiome ATAC-Seq and RNA-Seq Filtering Analysis" -s:alternateName: "Filters single-cell multiome ATAC and RNA-Seq datasets based on the common QC metrics" +s:alternateName: "Single-Cell Multiome ATAC-Seq and RNA-Seq Filtering Analysis" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-multiome-filter.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -1904,11 +1376,15 @@ s:creator: doc: | Single-Cell Multiome ATAC-Seq and RNA-Seq Filtering Analysis - Filters single-cell multiome ATAC and RNA-Seq datasets based on the common QC metrics. + Removes low-quality cells from the outputs of the “Cell Ranger Count + (RNA+ATAC)” and “Cell Ranger Aggregate (RNA+ATAC)” pipelines. 
The + results of this workflow are used in the “Single-Cell RNA-Seq + Dimensionality Reduction Analysis” and “Single-Cell ATAC-Seq + Dimensionality Reduction Analysis” pipelines. s:about: | - usage: /usr/local/bin/sc_multiome_filter.R [-h] --mex MEX --identity IDENTITY + usage: sc_multiome_filter.R [-h] --mex MEX --identity IDENTITY --fragments FRAGMENTS --annotations ANNOTATIONS --seqinfo SEQINFO [--grouping GROUPING] @@ -1925,7 +1401,7 @@ s:about: | [--minfragments [MINFRAGMENTS [MINFRAGMENTS ...]]] [--maxnuclsignal [MAXNUCLSIGNAL [MAXNUCLSIGNAL ...]]] [--mintssenrich [MINTSSENRICH [MINTSSENRICH ...]]] - [--minfrip [MINFRIP [MINFRIP ...]]] + [--minfrip MINFRIP] [--maxblacklist [MAXBLACKLIST [MAXBLACKLIST ...]]] [--callby CALLBY] [--qvalue QVALUE] [--removedoublets {union,onlyrna,onlyatac,intersect}] @@ -2049,13 +1525,9 @@ s:about: | correspondent dataset from the '--mex' input based on the '--identity' file. Default: 2 (applied to all datasets) - --minfrip [MINFRIP [MINFRIP ...]] - Include cells with the FRiP not lower than this value. - If multiple values provided, each of them will be - applied to the correspondent dataset from the '--mex' - input based on the '--identity' file. FRiP is - calculated for ATAC fragments. Default: 0.15 (applied - to all datasets) + --minfrip MINFRIP Include cells with the FRiP not lower than this value. + FRiP is calculated for ATAC fragments. Default: 0.15 + (applied to all datasets) --maxblacklist [MAXBLACKLIST [MAXBLACKLIST ...]] Include cells with the fraction of ATAC fragments in genomic blacklist regions not bigger than this value. 
diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index 9186f47b..bc4fae12 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.34 + dockerPull: biowardrobe2/sc-tools:v0.0.35 inputs: diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index e4ce986a..de4af447 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.34 + dockerPull: biowardrobe2/sc-tools:v0.0.35 inputs: diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index 3dc85ae0..51a1ba31 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.34 + dockerPull: biowardrobe2/sc-tools:v0.0.35 inputs: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index 40ea5934..33d5506e 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.34 + dockerPull: biowardrobe2/sc-tools:v0.0.35 inputs: @@ -280,484 +280,287 @@ outputs: outputBinding: glob: "*_raw_1_2_qc_mtrcs_pca.png" doc: | - PC1 and PC2 from the QC metrics PCA (not filtered). - PNG format - - raw_1_2_qc_mtrcs_pca_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_1_2_qc_mtrcs_pca.pdf" - doc: | - PC1 and PC2 from the QC metrics PCA (not filtered). - PDF format + QC metrics PCA. + Unfiltered; PC1/PC2. + PNG format. raw_2_3_qc_mtrcs_pca_plot_png: type: File? outputBinding: glob: "*_raw_2_3_qc_mtrcs_pca.png" doc: | - PC2 and PC3 from the QC metrics PCA (not filtered). - PNG format - - raw_2_3_qc_mtrcs_pca_plot_pdf: - type: File? 
- outputBinding: - glob: "*_raw_2_3_qc_mtrcs_pca.pdf" - doc: | - PC2 and PC3 from the QC metrics PCA (not filtered). - PDF format + QC metrics PCA. + Unfiltered; PC2/PC3. + PNG format. raw_cells_count_plot_png: type: File? outputBinding: glob: "*_raw_cells_count.png" doc: | - Number of cells per dataset (not filtered). - PNG format - - raw_cells_count_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_cells_count.pdf" - doc: | - Number of cells per dataset (not filtered). - PDF format + Number of cells per dataset. + Unfiltered. + PNG format. raw_umi_dnst_plot_png: type: File? outputBinding: glob: "*_raw_umi_dnst.png" doc: | - RNA reads per cell density (not filtered). - PNG format - - raw_umi_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_umi_dnst.pdf" - doc: | - RNA reads per cell density (not filtered). - PDF format + Distribution of RNA reads per cell. + Unfiltered. + PNG format. raw_gene_dnst_plot_png: type: File? outputBinding: glob: "*_raw_gene_dnst.png" doc: | - Genes per cell density (not filtered). - PNG format - - raw_gene_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_gene_dnst.pdf" - doc: | - Genes per cell density (not filtered). - PDF format + Distribution of genes per cell. + Unfiltered. + PNG format. raw_gene_umi_plot_png: type: File? outputBinding: glob: "*_raw_gene_umi.png" doc: | - Genes vs RNA reads per cell correlation (not filtered). - PNG format - - raw_gene_umi_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_gene_umi.pdf" - doc: | - Genes vs RNA reads per cell correlation (not filtered). - PDF format + Genes vs RNA reads per cell. + Unfiltered. + PNG format. raw_umi_mito_plot_png: type: File? outputBinding: glob: "*_raw_umi_mito.png" doc: | - RNA reads vs mitochondrial % per cell (not filtered). - PNG format - - raw_umi_mito_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_umi_mito.pdf" - doc: | - RNA reads vs mitochondrial % per cell (not filtered). 
- PDF format + RNA reads vs mitochondrial percentage + per cell. + Unfiltered. + PNG format. raw_mito_dnst_plot_png: type: File? outputBinding: glob: "*_raw_mito_dnst.png" doc: | - Percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered). - PNG format - - raw_mito_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_mito_dnst.pdf" - doc: | - Percentage of RNA reads mapped to mitochondrial genes per cell density (not filtered). - PDF format + Distribution of RNA reads mapped + to mitochondrial genes per cell. + Unfiltered. + PNG format. raw_nvlt_dnst_plot_png: type: File? outputBinding: glob: "*_raw_nvlt_dnst.png" doc: | - Novelty score per cell density (not filtered). - PNG format - - raw_nvlt_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_nvlt_dnst.pdf" - doc: | - Novelty score per cell density (not filtered). - PDF format + Distribution of novelty score per cell. + Unfiltered. + PNG format. raw_qc_mtrcs_dnst_plot_png: type: File? outputBinding: glob: "*_raw_qc_mtrcs_dnst.png" doc: | - QC metrics per cell density (not filtered). - PNG format - - raw_qc_mtrcs_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_qc_mtrcs_dnst.pdf" - doc: | - QC metrics per cell density (not filtered). - PDF format + Distribution of QC metrics per cell. + Unfiltered. + PNG format. raw_rnadbl_plot_png: type: File? outputBinding: glob: "*_raw_rnadbl.png" doc: | - Percentage of RNA doublets per dataset (not filtered). - PNG format - - raw_rnadbl_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_rnadbl.pdf" - doc: | - Percentage of RNA doublets per dataset (not filtered). - PDF format + Percentage of RNA doublets. + Unfiltered. + PNG format. raw_umi_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_raw_umi_dnst_spl_cnd.png" doc: | - Split by grouping condition RNA reads per cell density (not filtered). - PNG format - - raw_umi_dnst_spl_cnd_plot_pdf: - type: File? 
- outputBinding: - glob: "*_raw_umi_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition RNA reads per cell density (not filtered). - PDF format + Distribution of RNA reads per cell. + Unfiltered; split by grouping condition. + PNG format. raw_gene_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_raw_gene_dnst_spl_cnd.png" doc: | - Split by grouping condition genes per cell density (not filtered). - PNG format - - raw_gene_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_gene_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition genes per cell density (not filtered). - PDF format + Distribution of genes per cell. + Unfiltered; split by grouping condition. + PNG format. raw_mito_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_raw_mito_dnst_spl_cnd.png" doc: | - Split by grouping condition the percentage of RNA reads mapped - to mitochondrial genes per cell density (not filtered). - PNG format - - raw_mito_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_mito_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition the percentage of RNA reads mapped - to mitochondrial genes per cell density (not filtered). - PDF format + Distribution of RNA reads mapped + to mitochondrial genes per cell. + Unfiltered; split by grouping condition. + PNG format. raw_nvlt_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_raw_nvlt_dnst_spl_cnd.png" doc: | - Split by grouping condition the novelty score per cell density (not filtered). - PNG format - - raw_nvlt_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_raw_nvlt_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition the novelty score per cell density (not filtered). - PDF format + Distribution of novelty score per cell. + Unfiltered; split by grouping condition. + PNG format. fltr_1_2_qc_mtrcs_pca_plot_png: type: File? outputBinding: glob: "*_fltr_1_2_qc_mtrcs_pca.png" doc: | - PC1 and PC2 from the QC metrics PCA (filtered). 
- PNG format - - fltr_1_2_qc_mtrcs_pca_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_1_2_qc_mtrcs_pca.pdf" - doc: | - PC1 and PC2 from the QC metrics PCA (filtered). - PDF format + QC metrics PCA. + Filtered; PC1/PC2. + PNG format. fltr_2_3_qc_mtrcs_pca_plot_png: type: File? outputBinding: glob: "*_fltr_2_3_qc_mtrcs_pca.png" doc: | - PC2 and PC3 from the QC metrics PCA (filtered). - PNG format - - fltr_2_3_qc_mtrcs_pca_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_2_3_qc_mtrcs_pca.pdf" - doc: | - PC2 and PC3 from the QC metrics PCA (filtered). - PDF format + QC metrics PCA. + Filtered; PC2/PC3. + PNG format. fltr_cells_count_plot_png: type: File? outputBinding: glob: "*_fltr_cells_count.png" doc: | - Number of cells per dataset (filtered). - PNG format - - fltr_cells_count_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_cells_count.pdf" - doc: | - Number of cells per dataset (filtered). - PDF format + Number of cells per dataset. + Filtered. + PNG format. fltr_umi_dnst_plot_png: type: File? outputBinding: glob: "*_fltr_umi_dnst.png" doc: | - RNA reads per cell density (filtered). - PNG format - - fltr_umi_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_umi_dnst.pdf" - doc: | - RNA reads per cell density (filtered). - PDF format + Distribution of RNA reads per cell. + Filtered. + PNG format. fltr_gene_dnst_plot_png: type: File? outputBinding: glob: "*_fltr_gene_dnst.png" doc: | - Genes per cell density (filtered). - PNG format - - fltr_gene_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_gene_dnst.pdf" - doc: | - Genes per cell density (filtered). - PDF format + Distribution of genes per cell. + Filtered. + PNG format. fltr_gene_umi_plot_png: type: File? outputBinding: glob: "*_fltr_gene_umi.png" doc: | - Genes vs RNA reads per cell correlation (filtered). - PNG format - - fltr_gene_umi_plot_pdf: - type: File? 
- outputBinding: - glob: "*_fltr_gene_umi.pdf" - doc: | - Genes vs RNA reads per cell correlation (filtered). - PDF format + Genes vs RNA reads per cell. + Filtered. + PNG format. fltr_umi_mito_plot_png: type: File? outputBinding: glob: "*_fltr_umi_mito.png" doc: | - RNA reads vs mitochondrial % per cell (filtered). - PNG format - - fltr_umi_mito_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_umi_mito.pdf" - doc: | - RNA reads vs mitochondrial % per cell (filtered). - PDF format + RNA reads vs mitochondrial percentage + per cell. + Filtered. + PNG format. fltr_mito_dnst_plot_png: type: File? outputBinding: glob: "*_fltr_mito_dnst.png" doc: | - Percentage of RNA reads mapped to mitochondrial genes per cell density (filtered). - PNG format - - fltr_mito_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_mito_dnst.pdf" - doc: | - Percentage of RNA reads mapped to mitochondrial genes per cell density (filtered). - PDF format + Distribution of RNA reads mapped + to mitochondrial genes per cell. + Filtered. + PNG format. fltr_nvlt_dnst_plot_png: type: File? outputBinding: glob: "*_fltr_nvlt_dnst.png" doc: | - Novelty score per cell density (filtered). - PNG format - - fltr_nvlt_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_nvlt_dnst.pdf" - doc: | - Novelty score per cell density (filtered). - PDF format + Distribution of novelty score per cell. + Filtered. + PNG format. fltr_qc_mtrcs_dnst_plot_png: type: File? outputBinding: glob: "*_fltr_qc_mtrcs_dnst.png" doc: | - QC metrics per cell density (filtered). - PNG format - - fltr_qc_mtrcs_dnst_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_qc_mtrcs_dnst.pdf" - doc: | - QC metrics per cell density (filtered). - PDF format + Distribution of QC metrics per cell. + Filtered. + PNG format. fltr_rnadbl_plot_png: type: File? outputBinding: glob: "*_fltr_rnadbl.png" doc: | - Percentage of RNA doublets per dataset (filtered). - PNG format - - fltr_rnadbl_plot_pdf: - type: File? 
- outputBinding: - glob: "*_fltr_rnadbl.pdf" - doc: | - Percentage of RNA doublets per dataset (filtered). - PDF format + Percentage of RNA doublets. + Filtered. + PNG format. fltr_umi_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_fltr_umi_dnst_spl_cnd.png" doc: | - Split by grouping condition RNA reads per cell density (filtered). - PNG format - - fltr_umi_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_umi_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition RNA reads per cell density (filtered). - PDF format + Distribution of RNA reads per cell. + Filtered; split by grouping condition. + PNG format. fltr_gene_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_fltr_gene_dnst_spl_cnd.png" doc: | - Split by grouping condition genes per cell density (filtered). - PNG format - - fltr_gene_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_gene_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition genes per cell density (filtered). - PDF format + Distribution of genes per cell. + Filtered; split by grouping condition. + PNG format. fltr_mito_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_fltr_mito_dnst_spl_cnd.png" doc: | - Split by grouping condition the percentage of RNA reads mapped - to mitochondrial genes per cell density (filtered). - PNG format - - fltr_mito_dnst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_fltr_mito_dnst_spl_cnd.pdf" - doc: | - Split by grouping condition the percentage of RNA reads mapped - to mitochondrial genes per cell density (filtered). - PDF format + Distribution of RNA reads mapped + to mitochondrial genes per cell. + Filtered; split by grouping condition. + PNG format. fltr_nvlt_dnst_spl_cnd_plot_png: type: File? outputBinding: glob: "*_fltr_nvlt_dnst_spl_cnd.png" doc: | - Split by grouping condition the novelty score per cell density (filtered). - PNG format + Distribution of novelty score per cell. + Filtered; split by grouping condition. 
+ PNG format. - fltr_nvlt_dnst_spl_cnd_plot_pdf: - type: File? + all_plots_pdf: + type: + - "null" + - type: array + items: File outputBinding: - glob: "*_fltr_nvlt_dnst_spl_cnd.pdf" + glob: "*.pdf" doc: | - Split by grouping condition the novelty score per cell density (filtered). - PDF format + All generated plots. + PDF format. ucsc_cb_config_data: type: Directory? @@ -845,7 +648,7 @@ $schemas: label: "Single-Cell RNA-Seq Filtering Analysis" s:name: "Single-Cell RNA-Seq Filtering Analysis" -s:alternateName: "Filters single-cell RNA-Seq datasets based on the common QC metrics" +s:alternateName: "Single-Cell RNA-Seq Filtering Analysis" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-rna-filter.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -885,11 +688,15 @@ s:creator: doc: | Single-Cell RNA-Seq Filtering Analysis - Filters single-cell RNA-Seq datasets based on the common QC metrics. + Removes low-quality cells from the outputs of the “Cell + Ranger Count (RNA)”, “Cell Ranger Count (RNA+VDJ)”, and + “Cell Ranger Aggregate (RNA, RNA+VDJ)” pipelines. The + results of this workflow are used in the “Single-Cell + RNA-Seq Dimensionality Reduction Analysis” pipeline. s:about: | - usage: /usr/local/bin/sc_rna_filter.R [-h] --mex MEX [MEX ...] --identity + usage: sc_rna_filter.R [-h] --mex MEX [MEX ...] 
--identity IDENTITY [--grouping GROUPING] [--barcodes BARCODES] [--rnamincells RNAMINCELLS] diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index 32e9f329..f28d4226 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.34 + dockerPull: biowardrobe2/sc-tools:v0.0.35 inputs: diff --git a/tools/sc-rna-trajectory.cwl b/tools/sc-rna-trajectory.cwl index c985a914..1b10e470 100644 --- a/tools/sc-rna-trajectory.cwl +++ b/tools/sc-rna-trajectory.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.34 + dockerPull: biowardrobe2/sc-tools:v0.0.35 inputs: diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index 43a57651..c7f1e0a4 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.34 + dockerPull: biowardrobe2/sc-tools:v0.0.35 inputs: diff --git a/tools/sc-vdj-profile.cwl b/tools/sc-vdj-profile.cwl index a4ababfa..ab72d050 100644 --- a/tools/sc-vdj-profile.cwl +++ b/tools/sc-vdj-profile.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.34 + dockerPull: biowardrobe2/sc-tools:v0.0.35 inputs: diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index 112c61b6..aa41b137 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.34 + dockerPull: biowardrobe2/sc-tools:v0.0.35 inputs: diff --git a/workflows/sc-atac-filter.cwl b/workflows/sc-atac-filter.cwl new file mode 100644 index 00000000..3efb876d --- /dev/null +++ b/workflows/sc-atac-filter.cwl @@ -0,0 +1,936 @@ +cwlVersion: v1.1 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: 
StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var split_numbers = function(line) { + let splitted_line = line?line.split(/[\s,]+/).map(parseFloat):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + + +"sd:upstream": + sc_sample: + - "cellranger-atac-count.cwl" + - "cellranger-atac-aggr.cwl" + - "cellranger-arc-count.cwl" + - "cellranger-arc-aggr.cwl" + + +inputs: + + alias: + type: string + label: "Analysis name" + sd:preview: + position: 1 + + filtered_feature_bc_matrix_folder: + type: File + label: "Cell Ranger ATAC or RNA+ATAC Sample" + doc: | + Any "Cell Ranger ATAC or RNA+ATAC + Sample" that produces either only + chromatin accessibility or both gene + expression and chromatin accessibility + data in the form of a single compressed + feature-barcode matrix in MEX + format, ATAC fragments file in TSV + format, and optional aggregation + metadata file in TSV/CSV format. + This sample can be obtained from + "Cell Ranger Count (ATAC)", + "Cell Ranger Count (RNA+ATAC)", + "Cell Ranger Aggregate (ATAC)", or + "Cell Ranger Aggregate (RNA+ATAC)" + pipelines. If present, gene expression + data will be discarded. + "sd:upstreamSource": "sc_sample/filtered_feature_bc_matrix_folder" + "sd:localLabel": true + + atac_fragments_file: + type: File + secondaryFiles: + - .tbi + "sd:upstreamSource": "sc_sample/atac_fragments_file" + + aggregation_metadata: + type: File? + "sd:upstreamSource": "sc_sample/aggregation_metadata" + + annotation_gtf_file: + type: File + "sd:upstreamSource": "sc_sample/genome_indices/genome_indices/annotation_gtf" + + chrom_length_file: + type: File + "sd:upstreamSource": "sc_sample/genome_indices/chrom_length_file" + + blacklist_regions_file: + type: + - "null" + - type: enum + symbols: + - "hg19" + - "hg38" + - "mm10" + "sd:upstreamSource": "sc_sample/genome_indices/genome_indices/genome" + + grouping_data: + type: File?
+ label: "Datasets grouping (optional)" + doc: | + If the selected "Cell Ranger ATAC or + RNA+ATAC Sample" includes multiple + aggregated datasets, each dataset can + be assigned to a separate group by + providing a TSV/CSV file with "library_id" + and "condition" columns. Obtain this file + from the "aggregation_metadata.csv" + output generated by "Cell Ranger ATAC or + RNA+ATAC Sample" and accessible on the + "Files" tab. Remove all columns except + the "library_id". Add the group names + for each dataset in a separate column + named "condition". + + barcodes_data: + type: File? + label: "Selected cell barcodes (optional)" + doc: | + A TSV/CSV file to optionally prefilter + the single cell data by including only + the cells with the selected barcodes. + The provided file should include at + least one column named "barcode", with + one cell barcode per line. All other + columns, except for "barcode", will be + added to the single cell metadata loaded + from "Cell Ranger ATAC or RNA+ATAC Sample" + and can be utilized in the current or + future steps of analysis. + + call_by: + type: string? + default: "" + label: "Cells grouping for MACS2 peak calling" + doc: | + Single cell metadata column to be used + for cells grouping before using MACS2 + to replace 10x peaks with the new ones. + To group cells by dataset, use "dataset". + Custom groups can be defined based on + any single cell metadata added through + the "Selected cell barcodes (optional)" + input. Default: use the original peaks + generated by Cell Ranger ATAC or + RNA+ATAC Sample. + "sd:layout": + advanced: true + + minimum_qvalue: + type: float? + default: 0.05 + label: "Minimum MACS2 FDR" + doc: | + Minimum FDR (q-value) cutoff for MACS2 peak + detection. Ignored if "Cells grouping for + MACS2 peak calling" input is not provided. + Default: 0.05 + "sd:layout": + advanced: true + + remove_doublets: + type: boolean? 
+ default: false + label: "Remove doublets" + doc: | + Quality control filtering parameter + to remove cells identified as doublets. + Default: do not remove + "sd:layout": + advanced: true + + minimum_fragments: + type: string? + default: "1000" + label: "Minimum number of ATAC fragments in peaks per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the number of ATAC fragments + in peaks smaller than the provided + value. If the selected "Cell Ranger + ATAC or RNA+ATAC Sample" includes multiple + aggregated datasets, each of them can + be filtered independently by providing + comma or space-separated list of filtering + thresholds. The order and number of + the specified values need to match + with the datasets order from the + "aggregation_metadata.csv" output + generated by "Cell Ranger ATAC or + RNA+ATAC Sample" and accessible on + the "Files" tab. Any 0 will be replaced + with the auto-estimated threshold + (median - 2.5 * MAD) calculated per dataset. + Default: 1000 + "sd:layout": + advanced: true + + minimum_tss_enrich: + type: string? + default: "2" + label: "Minimum TSS enrichment score per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the TSS enrichment score + smaller than the provided value. + This QC metric is calculated based + on the ratio of ATAC fragments + centered at the genes TSS to ATAC + fragments in the TSS-flanking regions. + If the selected "Cell Ranger ATAC or + RNA+ATAC Sample" includes multiple + aggregated datasets, each of them can + be filtered independently by providing + comma or space-separated list of + filtering thresholds. The order and + number of the specified values need + to match with the datasets order from + the "aggregation_metadata.csv" output + generated by "Cell Ranger ATAC or + RNA+ATAC Sample" and accessible on + the "Files" tab. + Default: 2 + "sd:layout": + advanced: true + + minimum_frip: + type: float?
+ default: 0.15 + label: "Minimum FRiP per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the FRiP (Fraction of + Reads in Peaks) smaller than the + provided value. + Default: 0.15 + "sd:layout": + advanced: true + + maximum_nucl_signal: + type: string? + default: "4" + label: "Maximum nucleosome signal per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the nucleosome signal + higher than the provided value. + Nucleosome signal is a measurement + of nucleosome occupancy. It quantifies + the approximate ratio of mononucleosomal + to nucleosome-free ATAC fragments. + If the selected "Cell Ranger ATAC or + RNA+ATAC Sample" includes multiple + aggregated datasets, each of them can + be filtered independently by providing + comma or space-separated list of + filtering thresholds. The order and + number of the specified values need + to match with the datasets order from + the "aggregation_metadata.csv" output + generated by "Cell Ranger ATAC or + RNA+ATAC Sample" and accessible on + the "Files" tab. + Default: 4 + "sd:layout": + advanced: true + + maximum_blacklist_fraction: + type: string? + default: "0.05" + label: "Maximum blacklist fraction per cell" + doc: | + Quality control filtering threshold + to exclude from the analysis all + cells with the fraction of ATAC + fragments in genomic blacklist regions + bigger than the provided value. + If the selected "Cell Ranger ATAC or + RNA+ATAC Sample" includes multiple + aggregated datasets, each of them can + be filtered independently by providing + comma or space-separated list of + filtering thresholds. The order and + number of the specified values need + to match with the datasets order from + the "aggregation_metadata.csv" output + generated by "Cell Ranger ATAC or + RNA+ATAC Sample" and accessible on + the "Files" tab. 
+ Default: 0.05 + "sd:layout": + advanced: true + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + default: "classic" + label: "Plots color theme" + doc: | + Color theme for all plots saved + as PNG files. + Default: classic + "sd:layout": + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + - "2" + - "3" + - "4" + - "5" + - "6" + default: "4" + label: "Cores/CPUs" + doc: | + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 4 + "sd:layout": + advanced: true + + +outputs: + + raw_1_2_qc_mtrcs_pca_plot_png: + type: File? + outputSource: sc_atac_filter/raw_1_2_qc_mtrcs_pca_plot_png + label: "QC metrics PCA (unfiltered, PC1/PC2)" + doc: | + QC metrics PCA. + Unfiltered; PC1/PC2. + PNG format. + "sd:visualPlugins": + - image: + tab: "Unfiltered" + Caption: "QC metrics PCA (unfiltered, PC1/PC2)" + + raw_2_3_qc_mtrcs_pca_plot_png: + type: File? + outputSource: sc_atac_filter/raw_2_3_qc_mtrcs_pca_plot_png + label: "QC metrics PCA (unfiltered, PC2/PC3)" + doc: | + QC metrics PCA. + Unfiltered; PC2/PC3. + PNG format. + "sd:visualPlugins": + - image: + tab: "Unfiltered" + Caption: "QC metrics PCA (unfiltered, PC2/PC3)" + + raw_cells_count_plot_png: + type: File? + outputSource: sc_atac_filter/raw_cells_count_plot_png + label: "Number of cells per dataset (unfiltered)" + doc: | + Number of cells per dataset. + Unfiltered. + PNG format. + "sd:visualPlugins": + - image: + tab: "Unfiltered" + Caption: "Number of cells per dataset (unfiltered)" + + raw_frgm_dnst_plot_png: + type: File? + outputSource: sc_atac_filter/raw_frgm_dnst_plot_png + label: "Distribution of ATAC fragments in peaks per cell (unfiltered)" + doc: | + Distribution of ATAC fragments in peaks + per cell. + Unfiltered. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "Unfiltered" + Caption: "Distribution of ATAC fragments in peaks per cell (unfiltered)" + + raw_peak_dnst_plot_png: + type: File? + outputSource: sc_atac_filter/raw_peak_dnst_plot_png + label: "Distribution of peaks per cell (unfiltered)" + doc: | + Distribution of peaks per cell. + Unfiltered. + PNG format. + "sd:visualPlugins": + - image: + tab: "Unfiltered" + Caption: "Distribution of peaks per cell (unfiltered)" + + raw_blck_dnst_plot_png: + type: File? + outputSource: sc_atac_filter/raw_blck_dnst_plot_png + label: "Distribution of ATAC fragments within genomic blacklist regions per cell (unfiltered)" + doc: | + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Unfiltered. + PNG format. + "sd:visualPlugins": + - image: + tab: "Unfiltered" + Caption: "Distribution of ATAC fragments within genomic blacklist regions per cell (unfiltered)" + + raw_tss_frgm_plot_png: + type: File? + outputSource: sc_atac_filter/raw_tss_frgm_plot_png + label: "TSS enrichment score vs ATAC fragments in peaks per cell (unfiltered)" + doc: | + TSS enrichment score vs ATAC + fragments in peaks per cell. + Unfiltered. + PNG format. + "sd:visualPlugins": + - image: + tab: "Unfiltered" + Caption: "TSS enrichment score vs ATAC fragments in peaks per cell (unfiltered)" + + raw_qc_mtrcs_dnst_plot_png: + type: File? + outputSource: sc_atac_filter/raw_qc_mtrcs_dnst_plot_png + label: "Distribution of QC metrics per cell (unfiltered)" + doc: | + Distribution of QC metrics per cell. + Unfiltered. + PNG format. + "sd:visualPlugins": + - image: + tab: "Unfiltered" + Caption: "Distribution of QC metrics per cell (unfiltered)" + + raw_atacdbl_plot_png: + type: File? + outputSource: sc_atac_filter/raw_atacdbl_plot_png + label: "Percentage of ATAC doublets (unfiltered)" + doc: | + Percentage of ATAC doublets. + Unfiltered. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "Unfiltered" + Caption: "Percentage of ATAC doublets (unfiltered)" + + raw_tss_nrch_plot_png: + type: File? + outputSource: sc_atac_filter/raw_tss_nrch_plot_png + label: "Signal enrichment around TSS (unfiltered, split by the minimum TSS enrichment score threshold)" + doc: | + Signal enrichment around TSS. + Unfiltered; split by the minimum + TSS enrichment score threshold. + PNG format. + "sd:visualPlugins": + - image: + tab: "Unfiltered" + Caption: "Signal enrichment around TSS (unfiltered, split by the minimum TSS enrichment score threshold)" + + raw_frgm_hist_png: + type: File? + outputSource: sc_atac_filter/raw_frgm_hist_png + label: "Histogram of ATAC fragment length (unfiltered, split by the maximum nucleosome signal threshold)" + doc: | + Histogram of ATAC fragment length. + Unfiltered; split by the maximum + nucleosome signal threshold. + PNG format. + "sd:visualPlugins": + - image: + tab: "Unfiltered" + Caption: "Histogram of ATAC fragment length (unfiltered, split by the maximum nucleosome signal threshold)" + + raw_frgm_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_atac_filter/raw_frgm_dnst_spl_cnd_plot_png + label: "Distribution of ATAC fragments in peaks per cell (unfiltered, split by grouping condition)" + doc: | + Distribution of ATAC fragments in peaks + per cell. + Unfiltered; split by grouping condition. + PNG format. + "sd:visualPlugins": + - image: + tab: "Unfiltered, split by group" + Caption: "Distribution of ATAC fragments in peaks per cell (unfiltered, split by grouping condition)" + + raw_peak_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_atac_filter/raw_peak_dnst_spl_cnd_plot_png + label: "Distribution of peaks per cell (unfiltered, split by grouping condition)" + doc: | + Distribution of peaks per cell. + Unfiltered; split by grouping condition. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "Unfiltered, split by group" + Caption: "Distribution of peaks per cell (unfiltered, split by grouping condition)" + + raw_blck_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_atac_filter/raw_blck_dnst_spl_cnd_plot_png + label: "Distribution of ATAC fragments within genomic blacklist regions per cell (unfiltered, split by grouping condition)" + doc: | + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Unfiltered; split by grouping condition. + PNG format. + "sd:visualPlugins": + - image: + tab: "Unfiltered, split by group" + Caption: "Distribution of ATAC fragments within genomic blacklist regions per cell (unfiltered, split by grouping condition)" + + fltr_1_2_qc_mtrcs_pca_plot_png: + type: File? + outputSource: sc_atac_filter/fltr_1_2_qc_mtrcs_pca_plot_png + label: "QC metrics PCA (filtered, PC1/PC2)" + doc: | + QC metrics PCA. + Filtered; PC1/PC2. + PNG format. + "sd:visualPlugins": + - image: + tab: "Filtered" + Caption: "QC metrics PCA (filtered, PC1/PC2)" + + fltr_2_3_qc_mtrcs_pca_plot_png: + type: File? + outputSource: sc_atac_filter/fltr_2_3_qc_mtrcs_pca_plot_png + label: "QC metrics PCA (filtered, PC2/PC3)" + doc: | + QC metrics PCA. + Filtered; PC2/PC3. + PNG format. + "sd:visualPlugins": + - image: + tab: "Filtered" + Caption: "QC metrics PCA (filtered, PC2/PC3)" + + fltr_cells_count_plot_png: + type: File? + outputSource: sc_atac_filter/fltr_cells_count_plot_png + label: "Number of cells per dataset (filtered)" + doc: | + Number of cells per dataset. + Filtered. + PNG format. + "sd:visualPlugins": + - image: + tab: "Filtered" + Caption: "Number of cells per dataset (filtered)" + + fltr_frgm_dnst_plot_png: + type: File? + outputSource: sc_atac_filter/fltr_frgm_dnst_plot_png + label: "Distribution of ATAC fragments in peaks per cell (filtered)" + doc: | + Distribution of ATAC fragments in peaks + per cell. + Filtered. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "Filtered" + Caption: "Distribution of ATAC fragments in peaks per cell (filtered)" + + fltr_peak_dnst_plot_png: + type: File? + outputSource: sc_atac_filter/fltr_peak_dnst_plot_png + label: "Distribution of peaks per cell (filtered)" + doc: | + Distribution of peaks per cell. + Filtered. + PNG format. + "sd:visualPlugins": + - image: + tab: "Filtered" + Caption: "Distribution of peaks per cell (filtered)" + + fltr_blck_dnst_plot_png: + type: File? + outputSource: sc_atac_filter/fltr_blck_dnst_plot_png + label: "Distribution of ATAC fragments within genomic blacklist regions per cell (filtered)" + doc: | + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Filtered. + PNG format. + "sd:visualPlugins": + - image: + tab: "Filtered" + Caption: "Distribution of ATAC fragments within genomic blacklist regions per cell (filtered)" + + fltr_tss_frgm_plot_png: + type: File? + outputSource: sc_atac_filter/fltr_tss_frgm_plot_png + label: "TSS enrichment score vs ATAC fragments in peaks per cell (filtered)" + doc: | + TSS enrichment score vs ATAC + fragments in peaks per cell. + Filtered. + PNG format. + "sd:visualPlugins": + - image: + tab: "Filtered" + Caption: "TSS enrichment score vs ATAC fragments in peaks per cell (filtered)" + + fltr_qc_mtrcs_dnst_plot_png: + type: File? + outputSource: sc_atac_filter/fltr_qc_mtrcs_dnst_plot_png + label: "Distribution of QC metrics per cell (filtered)" + doc: | + Distribution of QC metrics per cell. + Filtered. + PNG format. + "sd:visualPlugins": + - image: + tab: "Filtered" + Caption: "Distribution of QC metrics per cell (filtered)" + + fltr_atacdbl_plot_png: + type: File? + outputSource: sc_atac_filter/fltr_atacdbl_plot_png + label: "Percentage of ATAC doublets (filtered)" + doc: | + Percentage of ATAC doublets. + Filtered. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "Filtered" + Caption: "Percentage of ATAC doublets (filtered)" + + fltr_tss_nrch_plot_png: + type: File? + outputSource: sc_atac_filter/fltr_tss_nrch_plot_png + label: "Signal enrichment around TSS (filtered, split by the minimum TSS enrichment score threshold)" + doc: | + Signal enrichment around TSS. + Filtered; split by the minimum + TSS enrichment score threshold. + PNG format. + "sd:visualPlugins": + - image: + tab: "Filtered" + Caption: "Signal enrichment around TSS (filtered, split by the minimum TSS enrichment score threshold)" + + fltr_frgm_hist_png: + type: File? + outputSource: sc_atac_filter/fltr_frgm_hist_png + label: "Histogram of ATAC fragment length (filtered, split by the maximum nucleosome signal threshold)" + doc: | + Histogram of ATAC fragment length. + Filtered; split by the maximum + nucleosome signal threshold. + PNG format. + "sd:visualPlugins": + - image: + tab: "Filtered" + Caption: "Histogram of ATAC fragment length (filtered, split by the maximum nucleosome signal threshold)" + + fltr_frgm_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_atac_filter/fltr_frgm_dnst_spl_cnd_plot_png + label: "Distribution of ATAC fragments in peaks per cell (filtered, split by grouping condition)" + doc: | + Distribution of ATAC fragments in peaks + per cell. + Filtered; split by grouping condition. + PNG format. + "sd:visualPlugins": + - image: + tab: "Filtered, split by group" + Caption: "Distribution of ATAC fragments in peaks per cell (filtered, split by grouping condition)" + + fltr_peak_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_atac_filter/fltr_peak_dnst_spl_cnd_plot_png + label: "Distribution of peaks per cell (filtered, split by grouping condition)" + doc: | + Distribution of peaks per cell. + Filtered; split by grouping condition. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "Filtered, split by group" + Caption: "Distribution of peaks per cell (filtered, split by grouping condition)" + + fltr_blck_dnst_spl_cnd_plot_png: + type: File? + outputSource: sc_atac_filter/fltr_blck_dnst_spl_cnd_plot_png + label: "Distribution of ATAC fragments within genomic blacklist regions per cell (filtered, split by grouping condition)" + doc: | + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Filtered; split by grouping condition. + PNG format. + "sd:visualPlugins": + - image: + tab: "Filtered, split by group" + Caption: "Distribution of ATAC fragments within genomic blacklist regions per cell (filtered, split by grouping condition)" + + ucsc_cb_html_data: + type: Directory + outputSource: sc_atac_filter/ucsc_cb_html_data + label: "UCSC Cell Browser (data)" + doc: | + UCSC Cell Browser html data. + + ucsc_cb_html_file: + type: File + outputSource: sc_atac_filter/ucsc_cb_html_file + label: "UCSC Cell Browser" + doc: | + UCSC Cell Browser html index. + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + + seurat_data_rds: + type: File + outputSource: sc_atac_filter/seurat_data_rds + label: "Seurat object in RDS format" + doc: | + Seurat object. + RDS format. + + datasets_metadata: + type: File + outputSource: sc_atac_filter/datasets_metadata + label: "Example of datasets metadata" + doc: | + Example of datasets metadata file + in TSV format + + pdf_plots: + type: File + outputSource: compress_pdf_plots/compressed_folder + label: "Compressed folder with all PDF plots" + doc: | + Compressed folder with all PDF plots. + + sc_atac_filter_stdout_log: + type: File + outputSource: sc_atac_filter/stdout_log + label: "Output log" + doc: | + Stdout log from the sc_atac_filter step. + + sc_atac_filter_stderr_log: + type: File + outputSource: sc_atac_filter/stderr_log + label: "Error log" + doc: | + Stderr log from the sc_atac_filter step. 
+ + +steps: + + uncompress_feature_bc_matrices: + run: ../tools/tar-extract.cwl + in: + file_to_extract: filtered_feature_bc_matrix_folder + out: + - extracted_folder + + sc_atac_filter: + run: ../tools/sc-atac-filter.cwl + in: + feature_bc_matrices_folder: uncompress_feature_bc_matrices/extracted_folder + aggregation_metadata: aggregation_metadata + atac_fragments_file: atac_fragments_file + annotation_gtf_file: annotation_gtf_file + chrom_length_file: chrom_length_file + grouping_data: grouping_data + blacklist_regions_file: blacklist_regions_file + barcodes_data: barcodes_data + call_by: + source: call_by + valueFrom: | + ${ + if (self == "dataset") { + return "new.ident"; + } else if (self == "") { + return null; + } else { + return self; + } + } + minimum_qvalue: minimum_qvalue + atac_minimum_cells: + default: 1 # will remove peaks that are not present in any of the cells + minimum_fragments: + source: minimum_fragments + valueFrom: $(split_numbers(self)) + maximum_nucl_signal: + source: maximum_nucl_signal + valueFrom: $(split_numbers(self)) + minimum_tss_enrich: + source: minimum_tss_enrich + valueFrom: $(split_numbers(self)) + minimum_frip: minimum_frip + maximum_blacklist_fraction: + source: maximum_blacklist_fraction + valueFrom: $(split_numbers(self)) + remove_doublets: remove_doublets + verbose: + default: true + export_ucsc_cb: + default: true + export_pdf_plots: + default: true + color_theme: color_theme + parallel_memory_limit: + default: 32 + vector_memory_limit: + default: 96 + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - raw_1_2_qc_mtrcs_pca_plot_png + - raw_2_3_qc_mtrcs_pca_plot_png + - raw_cells_count_plot_png + - raw_frgm_dnst_plot_png + - raw_peak_dnst_plot_png + - raw_blck_dnst_plot_png + - raw_tss_frgm_plot_png + - raw_qc_mtrcs_dnst_plot_png + - raw_atacdbl_plot_png + - raw_tss_nrch_plot_png + - raw_frgm_hist_png + - raw_frgm_dnst_spl_cnd_plot_png + - raw_peak_dnst_spl_cnd_plot_png + - raw_blck_dnst_spl_cnd_plot_png + 
- fltr_1_2_qc_mtrcs_pca_plot_png + - fltr_2_3_qc_mtrcs_pca_plot_png + - fltr_cells_count_plot_png + - fltr_frgm_dnst_plot_png + - fltr_peak_dnst_plot_png + - fltr_blck_dnst_plot_png + - fltr_atacdbl_plot_png + - fltr_tss_frgm_plot_png + - fltr_qc_mtrcs_dnst_plot_png + - fltr_tss_nrch_plot_png + - fltr_frgm_hist_png + - fltr_frgm_dnst_spl_cnd_plot_png + - fltr_peak_dnst_spl_cnd_plot_png + - fltr_blck_dnst_spl_cnd_plot_png + - all_plots_pdf + - ucsc_cb_html_data + - ucsc_cb_html_file + - seurat_data_rds + - datasets_metadata + - stdout_log + - stderr_log + + folder_pdf_plots: + run: ../tools/files-to-folder.cwl + in: + input_files: + source: + - sc_atac_filter/all_plots_pdf + valueFrom: $(self.flat().filter(n => n)) + folder_basename: + default: "pdf_plots" + out: + - folder + + compress_pdf_plots: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: folder_pdf_plots/folder + out: + - compressed_folder + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-Cell ATAC-Seq Filtering Analysis" +s:name: "Single-Cell ATAC-Seq Filtering Analysis" +s:alternateName: "Single-Cell ATAC-Seq Filtering Analysis" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-atac-filter.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: 
"https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-Cell ATAC-Seq Filtering Analysis + + Removes low-quality cells from the outputs of either the + “Cell Ranger Count (ATAC)” or “Cell Ranger Aggregate (ATAC)” + pipeline. The results of this workflow are used in the + “Single-Cell ATAC-Seq Dimensionality Reduction Analysis” + pipeline. \ No newline at end of file diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index c4e0ca8b..685a46d8 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -20,6 +20,7 @@ requirements: sc_tools_sample: - "sc-rna-cluster.cwl" - "sc-rna-reduce.cwl" + - "sc-atac-filter.cwl" - "sc-multiome-filter.cwl" diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 606e8869..1b80f3e9 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -356,8 +356,8 @@ inputs: advanced: true minimum_frip: - type: string? - default: "0.15" + type: float? + default: 0.15 label: "Minimum FRiP per cell" doc: | Quality control filtering threshold @@ -365,18 +365,6 @@ inputs: cells with the FRiP (Fraction of Reads in Peaks) smaller than the provided value. - If the selected "Cell Ranger RNA+ATAC - Sample" includes multiple aggregated - datasets, each of them can be filtered - independently by providing comma or - space-separated list of filtering - thresholds. 
The order and number of - the specified values need to match - with the datasets order from the - "aggregation_metadata.csv" output - generated by "Cell Ranger RNA+ATAC - Sample" and accessible on the "Files" - tab. Default: 0.15 "sd:layout": advanced: true @@ -485,667 +473,726 @@ outputs: raw_1_2_qc_mtrcs_pca_plot_png: type: File? outputSource: sc_multiome_filter/raw_1_2_qc_mtrcs_pca_plot_png - label: "QC metrics PCA (1,2), raw" + label: "QC metrics PCA (unfiltered, PC1/PC2)" doc: | - PC1 and PC2 from the QC metrics - PCA for raw data + QC metrics PCA. + Unfiltered; PC1/PC2. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "QC metrics PCA (1,2)" + tab: "Unfiltered" + Caption: "QC metrics PCA (unfiltered, PC1/PC2)" raw_2_3_qc_mtrcs_pca_plot_png: type: File? outputSource: sc_multiome_filter/raw_2_3_qc_mtrcs_pca_plot_png - label: "QC metrics PCA (2,3), raw" + label: "QC metrics PCA (unfiltered, PC2/PC3)" doc: | - PC2 and PC3 from the QC metrics - PCA for raw data + QC metrics PCA. + Unfiltered; PC2/PC3. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "QC metrics PCA (2,3)" + tab: "Unfiltered" + Caption: "QC metrics PCA (unfiltered, PC2/PC3)" raw_cells_count_plot_png: type: File? outputSource: sc_multiome_filter/raw_cells_count_plot_png - label: "Cells per dataset, raw" + label: "Number of cells per dataset (unfiltered)" doc: | - Number of cells per dataset - for raw data + Number of cells per dataset. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "Cells per dataset" + tab: "Unfiltered" + Caption: "Number of cells per dataset (unfiltered)" raw_umi_dnst_plot_png: type: File? outputSource: sc_multiome_filter/raw_umi_dnst_plot_png - label: "RNA reads per cell, raw" + label: "Distribution of RNA reads per cell (unfiltered)" doc: | - RNA reads per cell density - for raw data + Distribution of RNA reads per cell. + Unfiltered. + PNG format. 
"sd:visualPlugins": - image: - tab: "Raw" - Caption: "RNA reads per cell" + tab: "Unfiltered" + Caption: "Distribution of RNA reads per cell (unfiltered)" raw_gene_dnst_plot_png: type: File? outputSource: sc_multiome_filter/raw_gene_dnst_plot_png - label: "Genes per cell, raw" + label: "Distribution of genes per cell (unfiltered)" doc: | - Genes per cell density - for raw data + Distribution of genes per cell. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "Genes per cell" + tab: "Unfiltered" + Caption: "Distribution of genes per cell (unfiltered)" raw_gene_umi_plot_png: type: File? outputSource: sc_multiome_filter/raw_gene_umi_plot_png - label: "Genes vs RNA reads, raw" + label: "Genes vs RNA reads per cell (unfiltered)" doc: | - Genes vs RNA reads per cell - for raw data + Genes vs RNA reads per cell. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "Genes vs RNA reads" + tab: "Unfiltered" + Caption: "Genes vs RNA reads per cell (unfiltered)" raw_umi_mito_plot_png: type: File? outputSource: sc_multiome_filter/raw_umi_mito_plot_png - label: "RNA reads vs mitochondrial %, raw" + label: "RNA reads vs mitochondrial percentage per cell (unfiltered)" doc: | - RNA reads vs mitochondrial % per cell - for raw data + RNA reads vs mitochondrial percentage + per cell. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "RNA reads vs mitochondrial %" + tab: "Unfiltered" + Caption: "RNA reads vs mitochondrial percentage per cell (unfiltered)" raw_mito_dnst_plot_png: type: File? outputSource: sc_multiome_filter/raw_mito_dnst_plot_png - label: "Mitochondrial percentage, raw" + label: "Distribution of RNA reads mapped to mitochondrial genes per cell (unfiltered)" doc: | - Percentage of RNA reads mapped to - mitochondrial genes per cell density - for raw data + Distribution of RNA reads mapped + to mitochondrial genes per cell. + Unfiltered. + PNG format. 
"sd:visualPlugins": - image: - tab: "Raw" - Caption: "Mitochondrial percentage" + tab: "Unfiltered" + Caption: "Distribution of RNA reads mapped to mitochondrial genes per cell (unfiltered)" raw_nvlt_dnst_plot_png: type: File? outputSource: sc_multiome_filter/raw_nvlt_dnst_plot_png - label: "Novelty score, raw" + label: "Distribution of novelty score per cell (unfiltered)" doc: | - Novelty score per cell density - for raw data + Distribution of novelty score per cell. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "Novelty score" + tab: "Unfiltered" + Caption: "Distribution of novelty score per cell (unfiltered)" raw_frgm_dnst_plot_png: type: File? outputSource: sc_multiome_filter/raw_frgm_dnst_plot_png - label: "ATAC fragments in peaks per cell, raw" + label: "Distribution of ATAC fragments in peaks per cell (unfiltered)" doc: | - ATAC fragments in peaks per - cell density for raw data + Distribution of ATAC fragments in peaks + per cell. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "ATAC fragments in peaks per cell" + tab: "Unfiltered" + Caption: "Distribution of ATAC fragments in peaks per cell (unfiltered)" raw_peak_dnst_plot_png: type: File? outputSource: sc_multiome_filter/raw_peak_dnst_plot_png - label: "Peaks per cell, raw" + label: "Distribution of peaks per cell (unfiltered)" doc: | - Peaks per cell density - for raw data + Distribution of peaks per cell. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "Peaks per cell" + tab: "Unfiltered" + Caption: "Distribution of peaks per cell (unfiltered)" raw_blck_dnst_plot_png: type: File? 
outputSource: sc_multiome_filter/raw_blck_dnst_plot_png - label: "Blacklist regions fraction, raw" + label: "Distribution of ATAC fragments within genomic blacklist regions per cell (unfiltered)" doc: | - Fraction of ATAC fragments within - genomic blacklist regions per cell - density for raw data + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "Blacklist regions fraction" + tab: "Unfiltered" + Caption: "Distribution of ATAC fragments within genomic blacklist regions per cell (unfiltered)" raw_rna_atac_cnts_plot_png: type: File? outputSource: sc_multiome_filter/raw_rna_atac_cnts_plot_png - label: "RNA reads vs ATAC fragments in peaks, raw" + label: "RNA reads vs ATAC fragments in peaks per cell (unfiltered)" doc: | - RNA reads vs ATAC fragments in - peaks per cell for raw data + RNA reads vs ATAC fragments + in peaks per cell. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "RNA reads vs ATAC fragments in peaks" + tab: "Unfiltered" + Caption: "RNA reads vs ATAC fragments in peaks per cell (unfiltered)" raw_tss_frgm_plot_png: type: File? outputSource: sc_multiome_filter/raw_tss_frgm_plot_png - label: "TSS enrichment score vs ATAC fragments in peaks, raw" + label: "TSS enrichment score vs ATAC fragments in peaks per cell (unfiltered)" doc: | - TSS enrichment score vs ATAC fragments - in peaks per cell for raw data + TSS enrichment score vs ATAC + fragments in peaks per cell. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "TSS enrichment score vs ATAC fragments in peaks" + tab: "Unfiltered" + Caption: "TSS enrichment score vs ATAC fragments in peaks per cell (unfiltered)" raw_qc_mtrcs_dnst_plot_png: type: File? 
outputSource: sc_multiome_filter/raw_qc_mtrcs_dnst_plot_png - label: "Main QC metrics, raw" + label: "Distribution of QC metrics per cell (unfiltered)" doc: | - Main QC metrics per cell densities - for raw data + Distribution of QC metrics per cell. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "Main QC metrics" + tab: "Unfiltered" + Caption: "Distribution of QC metrics per cell (unfiltered)" raw_rnadbl_plot_png: type: File? outputSource: sc_multiome_filter/raw_rnadbl_plot_png - label: "RNA doublets, raw" + label: "Percentage of RNA doublets (unfiltered)" doc: | - Percentage of RNA doublets per - dataset for raw data + Percentage of RNA doublets. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "RNA doublets" + tab: "Unfiltered" + Caption: "Percentage of RNA doublets (unfiltered)" raw_atacdbl_plot_png: type: File? outputSource: sc_multiome_filter/raw_atacdbl_plot_png - label: "ATAC doublets, raw" + label: "Percentage of ATAC doublets (unfiltered)" doc: | - Percentage of ATAC doublets per - dataset for raw data + Percentage of ATAC doublets. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "ATAC doublets" + tab: "Unfiltered" + Caption: "Percentage of ATAC doublets (unfiltered)" raw_vrlpdbl_plot_png: type: File? outputSource: sc_multiome_filter/raw_vrlpdbl_plot_png - label: "RNA and ATAC doublets overlap, raw" + label: "Percentage of RNA and ATAC doublets (unfiltered)" doc: | - RNA and ATAC doublets overlap per - dataset for raw data + Percentage of RNA and ATAC doublets. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "RNA and ATAC doublets overlap" + tab: "Unfiltered" + Caption: "Percentage of RNA and ATAC doublets (unfiltered)" raw_tss_nrch_plot_png: type: File? 
outputSource: sc_multiome_filter/raw_tss_nrch_plot_png - label: "TSS enrichment, raw" + label: "Signal enrichment around TSS (unfiltered, split by the minimum TSS enrichment score threshold)" doc: | - TSS enrichment score - for raw data + Signal enrichment around TSS. + Unfiltered; split by the minimum + TSS enrichment score threshold. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "TSS enrichment" + tab: "Unfiltered" + Caption: "Signal enrichment around TSS (unfiltered, split by the minimum TSS enrichment score threshold)" raw_frgm_hist_png: type: File? outputSource: sc_multiome_filter/raw_frgm_hist_png - label: "ATAC fragments length, raw" + label: "Histogram of ATAC fragment length (unfiltered, split by the maximum nucleosome signal threshold)" doc: | - ATAC fragments length distribution - for raw data + Histogram of ATAC fragment length. + Unfiltered; split by the maximum + nucleosome signal threshold. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "ATAC fragments length" + tab: "Unfiltered" + Caption: "Histogram of ATAC fragment length (unfiltered, split by the maximum nucleosome signal threshold)" raw_umi_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/raw_umi_dnst_spl_cnd_plot_png - label: "RNA reads per cell, raw, split by condition" + label: "Distribution of RNA reads per cell (unfiltered, split by grouping condition)" doc: | - Split by grouping condition RNA reads - per cell density for raw data + Distribution of RNA reads per cell. + Unfiltered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Raw, by condition" - Caption: "RNA reads per cell" + tab: "Unfiltered, split by group" + Caption: "Distribution of RNA reads per cell (unfiltered, split by grouping condition)" raw_gene_dnst_spl_cnd_plot_png: type: File? 
outputSource: sc_multiome_filter/raw_gene_dnst_spl_cnd_plot_png - label: "Genes per cell, raw, split by condition" + label: "Distribution of genes per cell (unfiltered, split by grouping condition)" doc: | - Split by grouping condition genes - per cell for raw data + Distribution of genes per cell. + Unfiltered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Raw, by condition" - Caption: "Genes per cell" + tab: "Unfiltered, split by group" + Caption: "Distribution of genes per cell (unfiltered, split by grouping condition)" raw_mito_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/raw_mito_dnst_spl_cnd_plot_png - label: "Mitochondrial percentage, raw, split by condition" + label: "Distribution of RNA reads mapped to mitochondrial genes per cell (unfiltered, split by grouping condition)" doc: | - Split by grouping condition the - percentage of RNA reads mapped to - mitochondrial genes per cell density - for raw data + Distribution of RNA reads mapped + to mitochondrial genes per cell. + Unfiltered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Raw, by condition" - Caption: "Mitochondrial percentage" + tab: "Unfiltered, split by group" + Caption: "Distribution of RNA reads mapped to mitochondrial genes per cell (unfiltered, split by grouping condition)" raw_nvlt_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/raw_nvlt_dnst_spl_cnd_plot_png - label: "Novelty score, raw, split by condition" + label: "Distribution of novelty score per cell (unfiltered, split by grouping condition)" doc: | - Split by grouping condition the - novelty score per cell density - for raw data + Distribution of novelty score per cell. + Unfiltered; split by grouping condition. + PNG format. 
"sd:visualPlugins": - image: - tab: "Raw, by condition" - Caption: "Novelty score" + tab: "Unfiltered, split by group" + Caption: "Distribution of novelty score per cell (unfiltered, split by grouping condition)" raw_frgm_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/raw_frgm_dnst_spl_cnd_plot_png - label: "ATAC fragments in peaks per cell, raw, split by condition" + label: "Distribution of ATAC fragments in peaks per cell (unfiltered, split by grouping condition)" doc: | - Split by grouping condition ATAC - fragments in peaks per cell density - for raw data + Distribution of ATAC fragments in peaks + per cell. + Unfiltered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Raw, by condition" - Caption: "ATAC fragments in peaks per cell" + tab: "Unfiltered, split by group" + Caption: "Distribution of ATAC fragments in peaks per cell (unfiltered, split by grouping condition)" raw_peak_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/raw_peak_dnst_spl_cnd_plot_png - label: "Peaks per cell, raw, split by condition" + label: "Distribution of peaks per cell (unfiltered, split by grouping condition)" doc: | - Split by grouping condition peaks - per cell for raw data + Distribution of peaks per cell. + Unfiltered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Raw, by condition" - Caption: "Peaks per cell" + tab: "Unfiltered, split by group" + Caption: "Distribution of peaks per cell (unfiltered, split by grouping condition)" raw_blck_dnst_spl_cnd_plot_png: type: File? 
outputSource: sc_multiome_filter/raw_blck_dnst_spl_cnd_plot_png - label: "Blacklist regions fraction, raw, split by condition" + label: "Distribution of ATAC fragments within genomic blacklist regions per cell (unfiltered, split by grouping condition)" doc: | - Split by grouping condition the - fraction of ATAC fragments within - genomic blacklist regions per cell - density for raw data + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Unfiltered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Raw, by condition" - Caption: "Blacklist regions fraction" + tab: "Unfiltered, split by group" + Caption: "Distribution of ATAC fragments within genomic blacklist regions per cell (unfiltered, split by grouping condition)" fltr_1_2_qc_mtrcs_pca_plot_png: type: File? outputSource: sc_multiome_filter/fltr_1_2_qc_mtrcs_pca_plot_png - label: "QC metrics PCA (1,2), filtered" + label: "QC metrics PCA (filtered, PC1/PC2)" doc: | - PC1 and PC2 from the QC metrics - PCA for filtered data + QC metrics PCA. + Filtered; PC1/PC2. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "QC metrics PCA (1,2)" + Caption: "QC metrics PCA (filtered, PC1/PC2)" fltr_2_3_qc_mtrcs_pca_plot_png: type: File? outputSource: sc_multiome_filter/fltr_2_3_qc_mtrcs_pca_plot_png - label: "QC metrics PCA (2,3), filtered" + label: "QC metrics PCA (filtered, PC2/PC3)" doc: | - PC2 and PC3 from the QC metrics - PCA for filtered data + QC metrics PCA. + Filtered; PC2/PC3. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "QC metrics PCA (2,3)" + Caption: "QC metrics PCA (filtered, PC2/PC3)" fltr_cells_count_plot_png: type: File? outputSource: sc_multiome_filter/fltr_cells_count_plot_png - label: "Cells per dataset, filtered" + label: "Number of cells per dataset (filtered)" doc: | - Number of cells per dataset - for filtered data + Number of cells per dataset. + Filtered. + PNG format. 
"sd:visualPlugins": - image: tab: "Filtered" - Caption: "Cells per dataset" + Caption: "Number of cells per dataset (filtered)" fltr_umi_dnst_plot_png: type: File? outputSource: sc_multiome_filter/fltr_umi_dnst_plot_png - label: "RNA reads per cell, filtered" + label: "Distribution of RNA reads per cell (filtered)" doc: | - RNA reads per cell density - for filtered data + Distribution of RNA reads per cell. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "RNA reads per cell" + Caption: "Distribution of RNA reads per cell (filtered)" fltr_gene_dnst_plot_png: type: File? outputSource: sc_multiome_filter/fltr_gene_dnst_plot_png - label: "Genes per cell, filtered" + label: "Distribution of genes per cell (filtered)" doc: | - Genes per cell density - for filtered data + Distribution of genes per cell. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "Genes per cell" + Caption: "Distribution of genes per cell (filtered)" fltr_gene_umi_plot_png: type: File? outputSource: sc_multiome_filter/fltr_gene_umi_plot_png - label: "Genes vs RNA reads, filtered" + label: "Genes vs RNA reads per cell (filtered)" doc: | - Genes vs RNA reads per cell - for filtered data + Genes vs RNA reads per cell. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "Genes vs RNA reads" + Caption: "Genes vs RNA reads per cell (filtered)" fltr_umi_mito_plot_png: type: File? outputSource: sc_multiome_filter/fltr_umi_mito_plot_png - label: "RNA reads vs mitochondrial %, filtered" + label: "RNA reads vs mitochondrial percentage per cell (filtered)" doc: | - RNA reads vs mitochondrial % per cell - for filtered data + RNA reads vs mitochondrial percentage + per cell. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "RNA reads vs mitochondrial %, filtered" + Caption: "RNA reads vs mitochondrial percentage per cell (filtered)" fltr_mito_dnst_plot_png: type: File? 
outputSource: sc_multiome_filter/fltr_mito_dnst_plot_png - label: "Mitochondrial percentage, filtered" + label: "Distribution of RNA reads mapped to mitochondrial genes per cell (filtered)" doc: | - Percentage of RNA reads mapped to - mitochondrial genes per cell density - for filtered data + Distribution of RNA reads mapped + to mitochondrial genes per cell. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "Mitochondrial percentage" + Caption: "Distribution of RNA reads mapped to mitochondrial genes per cell (filtered)" fltr_nvlt_dnst_plot_png: type: File? outputSource: sc_multiome_filter/fltr_nvlt_dnst_plot_png - label: "Novelty score, filtered" + label: "Distribution of novelty score per cell (filtered)" doc: | - Novelty score per cell density - for filtered data + Distribution of novelty score per cell. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "Novelty score" + Caption: "Distribution of novelty score per cell (filtered)" fltr_frgm_dnst_plot_png: type: File? outputSource: sc_multiome_filter/fltr_frgm_dnst_plot_png - label: "ATAC fragments in peaks per cell, filtered" + label: "Distribution of ATAC fragments in peaks per cell (filtered)" doc: | - ATAC fragments in peaks per cell - density for filtered data + Distribution of ATAC fragments in peaks + per cell. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "ATAC fragments in peaks per cell" + Caption: "Distribution of ATAC fragments in peaks per cell (filtered)" fltr_peak_dnst_plot_png: type: File? outputSource: sc_multiome_filter/fltr_peak_dnst_plot_png - label: "Peaks per cell, filtered" + label: "Distribution of peaks per cell (filtered)" doc: | - Peaks per cell density - for filtered data + Distribution of peaks per cell. + Filtered. + PNG format. 
"sd:visualPlugins": - image: tab: "Filtered" - Caption: "Peaks per cell" + Caption: "Distribution of peaks per cell (filtered)" fltr_blck_dnst_plot_png: type: File? outputSource: sc_multiome_filter/fltr_blck_dnst_plot_png - label: "Blacklist regions fraction, filtered" + label: "Distribution of ATAC fragments within genomic blacklist regions per cell (filtered)" doc: | - Fraction of ATAC fragments within - genomic blacklist regions per cell - density for filtered data + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "Blacklist regions fraction" + Caption: "Distribution of ATAC fragments within genomic blacklist regions per cell (filtered)" fltr_rna_atac_cnts_plot_png: type: File? outputSource: sc_multiome_filter/fltr_rna_atac_cnts_plot_png - label: "RNA reads vs ATAC fragments in peaks, filtered" + label: "RNA reads vs ATAC fragments in peaks per cell (filtered)" doc: | - RNA reads vs ATAC fragments in - peaks per cell for filtered data + RNA reads vs ATAC fragments + in peaks per cell. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "RNA reads vs ATAC fragments in peaks" + Caption: "RNA reads vs ATAC fragments in peaks per cell (filtered)" fltr_tss_frgm_plot_png: type: File? outputSource: sc_multiome_filter/fltr_tss_frgm_plot_png - label: "TSS enrichment score vs ATAC fragments in peaks, filtered" + label: "TSS enrichment score vs ATAC fragments in peaks per cell (filtered)" doc: | TSS enrichment score vs ATAC - fragments in peaks per cell for - filtered data + fragments in peaks per cell. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "TSS enrichment score vs ATAC fragments in peaks" + Caption: "TSS enrichment score vs ATAC fragments in peaks per cell (filtered)" fltr_qc_mtrcs_dnst_plot_png: type: File? 
outputSource: sc_multiome_filter/fltr_qc_mtrcs_dnst_plot_png - label: "Main QC metrics, filtered" + label: "Distribution of QC metrics per cell (filtered)" doc: | - Main QC metrics per cell densities - for filtered data + Distribution of QC metrics per cell. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "Main QC metrics" + Caption: "Distribution of QC metrics per cell (filtered)" fltr_rnadbl_plot_png: type: File? outputSource: sc_multiome_filter/fltr_rnadbl_plot_png - label: "RNA doublets, filtered" + label: "Percentage of RNA doublets (filtered)" doc: | - Percentage of RNA doublets per - dataset for filtered data + Percentage of RNA doublets. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "RNA doublets" + Caption: "Percentage of RNA doublets (filtered)" fltr_atacdbl_plot_png: type: File? outputSource: sc_multiome_filter/fltr_atacdbl_plot_png - label: "ATAC doublets, filtered" + label: "Percentage of ATAC doublets (filtered)" doc: | - Percentage of ATAC doublets per - dataset for filtered data + Percentage of ATAC doublets. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "ATAC doublets" + Caption: "Percentage of ATAC doublets (filtered)" fltr_vrlpdbl_plot_png: type: File? outputSource: sc_multiome_filter/fltr_vrlpdbl_plot_png - label: "RNA and ATAC doublets overlap, filtered" + label: "Percentage of RNA and ATAC doublets (filtered)" doc: | - RNA and ATAC doublets overlap per - dataset for filtered data + Percentage of RNA and ATAC doublets. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "RNA and ATAC doublets overlap" + Caption: "Percentage of RNA and ATAC doublets (filtered)" fltr_tss_nrch_plot_png: type: File? 
outputSource: sc_multiome_filter/fltr_tss_nrch_plot_png - label: "TSS enrichment, filtered" + label: "Signal enrichment around TSS (filtered, split by the minimum TSS enrichment score threshold)" doc: | - TSS enrichment score - for filtered data + Signal enrichment around TSS. + Filtered; split by the minimum + TSS enrichment score threshold. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "TSS enrichment" + Caption: "Signal enrichment around TSS (filtered, split by the minimum TSS enrichment score threshold)" fltr_frgm_hist_png: type: File? outputSource: sc_multiome_filter/fltr_frgm_hist_png - label: "ATAC fragments length, filtered" + label: "Histogram of ATAC fragment length (filtered, split by the maximum nucleosome signal threshold)" doc: | - ATAC fragments length distribution - for filtered data + Histogram of ATAC fragment length. + Filtered; split by the maximum + nucleosome signal threshold. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "ATAC fragments length" + Caption: "Histogram of ATAC fragment length (filtered, split by the maximum nucleosome signal threshold)" fltr_umi_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/fltr_umi_dnst_spl_cnd_plot_png - label: "RNA reads per cell, filtered, split by condition" + label: "Distribution of RNA reads per cell (filtered, split by grouping condition)" doc: | - Split by grouping condition RNA reads - per cell density for filtered data + Distribution of RNA reads per cell. + Filtered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Filtered, by condition" - Caption: "RNA reads per cell" + tab: "Filtered, split by group" + Caption: "Distribution of RNA reads per cell (filtered, split by grouping condition)" fltr_gene_dnst_spl_cnd_plot_png: type: File? 
outputSource: sc_multiome_filter/fltr_gene_dnst_spl_cnd_plot_png - label: "Genes per cell, filtered, split by condition" + label: "Distribution of genes per cell (filtered, split by grouping condition)" doc: | - Split by grouping condition genes - per cell for filtered data + Distribution of genes per cell. + Filtered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Filtered, by condition" - Caption: "Genes per cell" + tab: "Filtered, split by group" + Caption: "Distribution of genes per cell (filtered, split by grouping condition)" fltr_mito_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/fltr_mito_dnst_spl_cnd_plot_png - label: "Mitochondrial percentage, filtered, split by condition" + label: "Distribution of RNA reads mapped to mitochondrial genes per cell (filtered, split by grouping condition)" doc: | - Split by grouping condition the - percentage of RNA reads mapped to - mitochondrial genes per cell density - for filtered data + Distribution of RNA reads mapped + to mitochondrial genes per cell. + Filtered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Filtered, by condition" - Caption: "Mitochondrial percentage" + tab: "Filtered, split by group" + Caption: "Distribution of RNA reads mapped to mitochondrial genes per cell (filtered, split by grouping condition)" fltr_nvlt_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/fltr_nvlt_dnst_spl_cnd_plot_png - label: "Novelty score, filtered, split by condition" + label: "Distribution of novelty score per cell (filtered, split by grouping condition)" doc: | - Split by grouping condition the - novelty score per cell density - for filtered data + Distribution of novelty score per cell. + Filtered; split by grouping condition. + PNG format. 
"sd:visualPlugins": - image: - tab: "Filtered, by condition" - Caption: "Novelty score" + tab: "Filtered, split by group" + Caption: "Distribution of novelty score per cell (filtered, split by grouping condition)" fltr_frgm_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/fltr_frgm_dnst_spl_cnd_plot_png - label: "ATAC fragments in peaks per cell, filtered, split by condition" + label: "Distribution of ATAC fragments in peaks per cell (filtered, split by grouping condition)" doc: | - Split by grouping condition ATAC - fragments in peaks per cell density - for filtered data + Distribution of ATAC fragments in peaks + per cell. + Filtered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Filtered, by condition" - Caption: "ATAC fragments in peaks per cell" + tab: "Filtered, split by group" + Caption: "Distribution of ATAC fragments in peaks per cell (filtered, split by grouping condition)" fltr_peak_dnst_spl_cnd_plot_png: type: File? outputSource: sc_multiome_filter/fltr_peak_dnst_spl_cnd_plot_png - label: "Peaks per cell, filtered, split by condition" + label: "Distribution of peaks per cell (filtered, split by grouping condition)" doc: | - Split by grouping condition peaks - per cell for filtered data + Distribution of peaks per cell. + Filtered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Filtered, by condition" - Caption: "Peaks per cell" + tab: "Filtered, split by group" + Caption: "Distribution of peaks per cell (filtered, split by grouping condition)" fltr_blck_dnst_spl_cnd_plot_png: type: File? 
outputSource: sc_multiome_filter/fltr_blck_dnst_spl_cnd_plot_png - label: "Blacklist regions fraction, filtered, split by condition" + label: "Distribution of ATAC fragments within genomic blacklist regions per cell (filtered, split by grouping condition)" doc: | - Split by grouping condition the - fraction of ATAC fragments within - genomic blacklist regions per cell - density for filtered data + Distribution of ATAC fragments within + genomic blacklist regions per cell. + Filtered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Filtered, by condition" - Caption: "Blacklist regions fraction" + tab: "Filtered, split by group" + Caption: "Distribution of ATAC fragments within genomic blacklist regions per cell (filtered, split by grouping condition)" ucsc_cb_html_data: type: Directory @@ -1268,9 +1315,7 @@ steps: minimum_tss_enrich: source: minimum_tss_enrich valueFrom: $(split_numbers(self)) - minimum_frip: - source: minimum_frip - valueFrom: $(split_numbers(self)) + minimum_frip: minimum_frip maximum_blacklist_fraction: source: maximum_blacklist_fraction valueFrom: $(split_numbers(self)) @@ -1359,60 +1404,7 @@ steps: - fltr_frgm_dnst_spl_cnd_plot_png - fltr_peak_dnst_spl_cnd_plot_png - fltr_blck_dnst_spl_cnd_plot_png - - raw_1_2_qc_mtrcs_pca_plot_pdf - - raw_2_3_qc_mtrcs_pca_plot_pdf - - raw_cells_count_plot_pdf - - raw_umi_dnst_plot_pdf - - raw_gene_dnst_plot_pdf - - raw_gene_umi_plot_pdf - - raw_umi_mito_plot_pdf - - raw_mito_dnst_plot_pdf - - raw_nvlt_dnst_plot_pdf - - raw_frgm_dnst_plot_pdf - - raw_peak_dnst_plot_pdf - - raw_blck_dnst_plot_pdf - - raw_rna_atac_cnts_plot_pdf - - raw_tss_frgm_plot_pdf - - raw_qc_mtrcs_dnst_plot_pdf - - raw_rnadbl_plot_pdf - - raw_atacdbl_plot_pdf - - raw_vrlpdbl_plot_pdf - - raw_tss_nrch_plot_pdf - - raw_frgm_hist_pdf - - raw_umi_dnst_spl_cnd_plot_pdf - - raw_gene_dnst_spl_cnd_plot_pdf - - raw_mito_dnst_spl_cnd_plot_pdf - - raw_nvlt_dnst_spl_cnd_plot_pdf - - raw_frgm_dnst_spl_cnd_plot_pdf - - 
raw_peak_dnst_spl_cnd_plot_pdf - - raw_blck_dnst_spl_cnd_plot_pdf - - fltr_1_2_qc_mtrcs_pca_plot_pdf - - fltr_2_3_qc_mtrcs_pca_plot_pdf - - fltr_cells_count_plot_pdf - - fltr_umi_dnst_plot_pdf - - fltr_gene_dnst_plot_pdf - - fltr_gene_umi_plot_pdf - - fltr_umi_mito_plot_pdf - - fltr_mito_dnst_plot_pdf - - fltr_nvlt_dnst_plot_pdf - - fltr_frgm_dnst_plot_pdf - - fltr_peak_dnst_plot_pdf - - fltr_blck_dnst_plot_pdf - - fltr_rna_atac_cnts_plot_pdf - - fltr_rnadbl_plot_pdf - - fltr_atacdbl_plot_pdf - - fltr_vrlpdbl_plot_pdf - - fltr_tss_frgm_plot_pdf - - fltr_qc_mtrcs_dnst_plot_pdf - - fltr_tss_nrch_plot_pdf - - fltr_frgm_hist_pdf - - fltr_umi_dnst_spl_cnd_plot_pdf - - fltr_gene_dnst_spl_cnd_plot_pdf - - fltr_mito_dnst_spl_cnd_plot_pdf - - fltr_nvlt_dnst_spl_cnd_plot_pdf - - fltr_frgm_dnst_spl_cnd_plot_pdf - - fltr_peak_dnst_spl_cnd_plot_pdf - - fltr_blck_dnst_spl_cnd_plot_pdf + - all_plots_pdf - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds @@ -1425,60 +1417,7 @@ steps: in: input_files: source: - - sc_multiome_filter/raw_1_2_qc_mtrcs_pca_plot_pdf - - sc_multiome_filter/raw_2_3_qc_mtrcs_pca_plot_pdf - - sc_multiome_filter/raw_cells_count_plot_pdf - - sc_multiome_filter/raw_umi_dnst_plot_pdf - - sc_multiome_filter/raw_gene_dnst_plot_pdf - - sc_multiome_filter/raw_gene_umi_plot_pdf - - sc_multiome_filter/raw_umi_mito_plot_pdf - - sc_multiome_filter/raw_mito_dnst_plot_pdf - - sc_multiome_filter/raw_nvlt_dnst_plot_pdf - - sc_multiome_filter/raw_frgm_dnst_plot_pdf - - sc_multiome_filter/raw_peak_dnst_plot_pdf - - sc_multiome_filter/raw_blck_dnst_plot_pdf - - sc_multiome_filter/raw_rna_atac_cnts_plot_pdf - - sc_multiome_filter/raw_tss_frgm_plot_pdf - - sc_multiome_filter/raw_qc_mtrcs_dnst_plot_pdf - - sc_multiome_filter/raw_rnadbl_plot_pdf - - sc_multiome_filter/raw_atacdbl_plot_pdf - - sc_multiome_filter/raw_vrlpdbl_plot_pdf - - sc_multiome_filter/raw_tss_nrch_plot_pdf - - sc_multiome_filter/raw_frgm_hist_pdf - - sc_multiome_filter/raw_umi_dnst_spl_cnd_plot_pdf - - 
sc_multiome_filter/raw_gene_dnst_spl_cnd_plot_pdf - - sc_multiome_filter/raw_mito_dnst_spl_cnd_plot_pdf - - sc_multiome_filter/raw_nvlt_dnst_spl_cnd_plot_pdf - - sc_multiome_filter/raw_frgm_dnst_spl_cnd_plot_pdf - - sc_multiome_filter/raw_peak_dnst_spl_cnd_plot_pdf - - sc_multiome_filter/raw_blck_dnst_spl_cnd_plot_pdf - - sc_multiome_filter/fltr_1_2_qc_mtrcs_pca_plot_pdf - - sc_multiome_filter/fltr_2_3_qc_mtrcs_pca_plot_pdf - - sc_multiome_filter/fltr_cells_count_plot_pdf - - sc_multiome_filter/fltr_umi_dnst_plot_pdf - - sc_multiome_filter/fltr_gene_dnst_plot_pdf - - sc_multiome_filter/fltr_gene_umi_plot_pdf - - sc_multiome_filter/fltr_umi_mito_plot_pdf - - sc_multiome_filter/fltr_mito_dnst_plot_pdf - - sc_multiome_filter/fltr_nvlt_dnst_plot_pdf - - sc_multiome_filter/fltr_frgm_dnst_plot_pdf - - sc_multiome_filter/fltr_peak_dnst_plot_pdf - - sc_multiome_filter/fltr_blck_dnst_plot_pdf - - sc_multiome_filter/fltr_rna_atac_cnts_plot_pdf - - sc_multiome_filter/fltr_rnadbl_plot_pdf - - sc_multiome_filter/fltr_atacdbl_plot_pdf - - sc_multiome_filter/fltr_vrlpdbl_plot_pdf - - sc_multiome_filter/fltr_tss_frgm_plot_pdf - - sc_multiome_filter/fltr_qc_mtrcs_dnst_plot_pdf - - sc_multiome_filter/fltr_tss_nrch_plot_pdf - - sc_multiome_filter/fltr_frgm_hist_pdf - - sc_multiome_filter/fltr_umi_dnst_spl_cnd_plot_pdf - - sc_multiome_filter/fltr_gene_dnst_spl_cnd_plot_pdf - - sc_multiome_filter/fltr_mito_dnst_spl_cnd_plot_pdf - - sc_multiome_filter/fltr_nvlt_dnst_spl_cnd_plot_pdf - - sc_multiome_filter/fltr_frgm_dnst_spl_cnd_plot_pdf - - sc_multiome_filter/fltr_peak_dnst_spl_cnd_plot_pdf - - sc_multiome_filter/fltr_blck_dnst_spl_cnd_plot_pdf + - sc_multiome_filter/all_plots_pdf valueFrom: $(self.flat().filter(n => n)) folder_basename: default: "pdf_plots" @@ -1501,7 +1440,7 @@ $schemas: label: "Single-Cell Multiome ATAC-Seq and RNA-Seq Filtering Analysis" s:name: "Single-Cell Multiome ATAC-Seq and RNA-Seq Filtering Analysis" -s:alternateName: "Filters single-cell multiome ATAC and 
RNA-Seq datasets based on the common QC metrics" +s:alternateName: "Single-Cell Multiome ATAC-Seq and RNA-Seq Filtering Analysis" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-multiome-filter.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -1541,8 +1480,8 @@ s:creator: doc: | Single-Cell Multiome ATAC-Seq and RNA-Seq Filtering Analysis - Removes low-quality cells from the outputs of “Cell Ranger Count + Removes low-quality cells from the outputs of the “Cell Ranger Count (RNA+ATAC)” and “Cell Ranger Aggregate (RNA+ATAC)” pipelines. The - results of this workflow are primarily used in “Single-Cell RNA-Seq + results of this workflow are used in the “Single-Cell RNA-Seq Dimensionality Reduction Analysis” and “Single-Cell ATAC-Seq Dimensionality Reduction Analysis” pipelines. \ No newline at end of file diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index 066bd100..4aeef01f 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -285,370 +285,398 @@ outputs: raw_1_2_qc_mtrcs_pca_plot_png: type: File? outputSource: sc_rna_filter/raw_1_2_qc_mtrcs_pca_plot_png - label: "QC metrics PCA (1,2), raw" + label: "QC metrics PCA (unfiltered, PC1/PC2)" doc: | - PC1 and PC2 from the QC metrics - PCA for raw data + QC metrics PCA. + Unfiltered; PC1/PC2. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "QC metrics PCA (1,2)" + tab: "Unfiltered" + Caption: "QC metrics PCA (unfiltered, PC1/PC2)" raw_2_3_qc_mtrcs_pca_plot_png: type: File? outputSource: sc_rna_filter/raw_2_3_qc_mtrcs_pca_plot_png - label: "QC metrics PCA (2,3), raw" + label: "QC metrics PCA (unfiltered, PC2/PC3)" doc: | - PC2 and PC3 from the QC metrics - PCA for raw data + QC metrics PCA. + Unfiltered; PC2/PC3. + PNG format. 
"sd:visualPlugins": - image: - tab: "Raw" - Caption: "QC metrics PCA (2,3)" + tab: "Unfiltered" + Caption: "QC metrics PCA (unfiltered, PC2/PC3)" raw_cells_count_plot_png: type: File? outputSource: sc_rna_filter/raw_cells_count_plot_png - label: "Cells per dataset, raw" + label: "Number of cells per dataset (unfiltered)" doc: | - Number of cells per dataset - for raw data + Number of cells per dataset. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "Cells per dataset" + tab: "Unfiltered" + Caption: "Number of cells per dataset (unfiltered)" raw_umi_dnst_plot_png: type: File? outputSource: sc_rna_filter/raw_umi_dnst_plot_png - label: "RNA reads per cell, raw" + label: "Distribution of RNA reads per cell (unfiltered)" doc: | - RNA reads per cell density - for raw data + Distribution of RNA reads per cell. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "RNA reads per cell" + tab: "Unfiltered" + Caption: "Distribution of RNA reads per cell (unfiltered)" raw_gene_dnst_plot_png: type: File? outputSource: sc_rna_filter/raw_gene_dnst_plot_png - label: "Genes per cell, raw" + label: "Distribution of genes per cell (unfiltered)" doc: | - Genes per cell density - for raw data + Distribution of genes per cell. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "Genes per cell" + tab: "Unfiltered" + Caption: "Distribution of genes per cell (unfiltered)" raw_gene_umi_plot_png: type: File? outputSource: sc_rna_filter/raw_gene_umi_plot_png - label: "Genes vs RNA reads, raw" + label: "Genes vs RNA reads per cell (unfiltered)" doc: | - Genes vs RNA reads per cell - for raw data + Genes vs RNA reads per cell. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "Genes vs RNA reads" + tab: "Unfiltered" + Caption: "Genes vs RNA reads per cell (unfiltered)" raw_umi_mito_plot_png: type: File? 
outputSource: sc_rna_filter/raw_umi_mito_plot_png - label: "RNA reads vs mitochondrial %, raw" + label: "RNA reads vs mitochondrial percentage per cell (unfiltered)" doc: | - RNA reads vs mitochondrial % per cell - for raw data + RNA reads vs mitochondrial percentage + per cell. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "RNA reads vs mitochondrial %" + tab: "Unfiltered" + Caption: "RNA reads vs mitochondrial percentage per cell (unfiltered)" raw_mito_dnst_plot_png: type: File? outputSource: sc_rna_filter/raw_mito_dnst_plot_png - label: "Mitochondrial percentage, raw" + label: "Distribution of RNA reads mapped to mitochondrial genes per cell (unfiltered)" doc: | - Percentage of RNA reads mapped to - mitochondrial genes per cell density - for raw data + Distribution of RNA reads mapped + to mitochondrial genes per cell. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "Mitochondrial percentage" + tab: "Unfiltered" + Caption: "Distribution of RNA reads mapped to mitochondrial genes per cell (unfiltered)" raw_nvlt_dnst_plot_png: type: File? outputSource: sc_rna_filter/raw_nvlt_dnst_plot_png - label: "Novelty score, raw" + label: "Distribution of novelty score per cell (unfiltered)" doc: | - Novelty score per cell density - for raw data + Distribution of novelty score per cell. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "Novelty score" + tab: "Unfiltered" + Caption: "Distribution of novelty score per cell (unfiltered)" raw_qc_mtrcs_dnst_plot_png: type: File? outputSource: sc_rna_filter/raw_qc_mtrcs_dnst_plot_png - label: "Main QC metrics, raw" + label: "Distribution of QC metrics per cell (unfiltered)" doc: | - Main QC metrics per cell densities - for raw data + Distribution of QC metrics per cell. + Unfiltered. + PNG format. 
"sd:visualPlugins": - image: - tab: "Raw" - Caption: "Main QC metrics" + tab: "Unfiltered" + Caption: "Distribution of QC metrics per cell (unfiltered)" raw_rnadbl_plot_png: type: File? outputSource: sc_rna_filter/raw_rnadbl_plot_png - label: "RNA doublets, raw" + label: "Percentage of RNA doublets (unfiltered)" doc: | - Percentage of RNA doublets per - dataset for raw data + Percentage of RNA doublets. + Unfiltered. + PNG format. "sd:visualPlugins": - image: - tab: "Raw" - Caption: "RNA doublets" + tab: "Unfiltered" + Caption: "Percentage of RNA doublets (unfiltered)" raw_umi_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/raw_umi_dnst_spl_cnd_plot_png - label: "RNA reads per cell, raw, split by condition" + label: "Distribution of RNA reads per cell (unfiltered, split by grouping condition)" doc: | - Split by grouping condition RNA reads - per cell density for raw data + Distribution of RNA reads per cell. + Unfiltered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Raw, by condition" - Caption: "RNA reads per cell" + tab: "Unfiltered, split by group" + Caption: "Distribution of RNA reads per cell (unfiltered, split by grouping condition)" raw_gene_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/raw_gene_dnst_spl_cnd_plot_png - label: "Genes per cell, raw, split by condition" + label: "Distribution of genes per cell (unfiltered, split by grouping condition)" doc: | - Split by grouping condition genes - per cell for raw data + Distribution of genes per cell. + Unfiltered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Raw, by condition" - Caption: "Genes per cell" + tab: "Unfiltered, split by group" + Caption: "Distribution of genes per cell (unfiltered, split by grouping condition)" raw_mito_dnst_spl_cnd_plot_png: type: File? 
outputSource: sc_rna_filter/raw_mito_dnst_spl_cnd_plot_png - label: "Mitochondrial percentage, raw, split by condition" + label: "Distribution of RNA reads mapped to mitochondrial genes per cell (unfiltered, split by grouping condition)" doc: | - Split by grouping condition the - percentage of RNA reads mapped to - mitochondrial genes per cell density - for raw data + Distribution of RNA reads mapped + to mitochondrial genes per cell. + Unfiltered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Raw, by condition" - Caption: "Mitochondrial percentage" + tab: "Unfiltered, split by group" + Caption: "Distribution of RNA reads mapped to mitochondrial genes per cell (unfiltered, split by grouping condition)" raw_nvlt_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/raw_nvlt_dnst_spl_cnd_plot_png - label: "Novelty score, raw, split by condition" + label: "Distribution of novelty score per cell (unfiltered, split by grouping condition)" doc: | - Split by grouping condition the - novelty score per cell density - for raw data + Distribution of novelty score per cell. + Unfiltered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Raw, by condition" - Caption: "Novelty score" + tab: "Unfiltered, split by group" + Caption: "Distribution of novelty score per cell (unfiltered, split by grouping condition)" fltr_1_2_qc_mtrcs_pca_plot_png: type: File? outputSource: sc_rna_filter/fltr_1_2_qc_mtrcs_pca_plot_png - label: "QC metrics PCA (1,2), filtered" + label: "QC metrics PCA (filtered, PC1/PC2)" doc: | - PC1 and PC2 from the QC metrics - PCA for filtered data + QC metrics PCA. + Filtered; PC1/PC2. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "QC metrics PCA (1,2)" + Caption: "QC metrics PCA (filtered, PC1/PC2)" fltr_2_3_qc_mtrcs_pca_plot_png: type: File? 
outputSource: sc_rna_filter/fltr_2_3_qc_mtrcs_pca_plot_png - label: "QC metrics PCA (2,3), filtered" + label: "QC metrics PCA (filtered, PC2/PC3)" doc: | - PC2 and PC3 from the QC metrics - PCA for filtered data + QC metrics PCA. + Filtered; PC2/PC3. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "QC metrics PCA (2,3)" + Caption: "QC metrics PCA (filtered, PC2/PC3)" fltr_cells_count_plot_png: type: File? outputSource: sc_rna_filter/fltr_cells_count_plot_png - label: "Cells per dataset, filtered" + label: "Number of cells per dataset (filtered)" doc: | - Number of cells per dataset - for filtered data + Number of cells per dataset. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "Cells per dataset" + Caption: "Number of cells per dataset (filtered)" fltr_umi_dnst_plot_png: type: File? outputSource: sc_rna_filter/fltr_umi_dnst_plot_png - label: "RNA reads per cell, filtered" + label: "Distribution of RNA reads per cell (filtered)" doc: | - RNA reads per cell density - for filtered data + Distribution of RNA reads per cell. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "RNA reads per cell" + Caption: "Distribution of RNA reads per cell (filtered)" fltr_gene_dnst_plot_png: type: File? outputSource: sc_rna_filter/fltr_gene_dnst_plot_png - label: "Genes per cell, filtered" + label: "Distribution of genes per cell (filtered)" doc: | - Genes per cell density - for filtered data + Distribution of genes per cell. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "Genes per cell" + Caption: "Distribution of genes per cell (filtered)" fltr_gene_umi_plot_png: type: File? outputSource: sc_rna_filter/fltr_gene_umi_plot_png - label: "RNA reads vs mitochondrial %, filtered" + label: "Genes vs RNA reads per cell (filtered)" doc: | - RNA reads vs mitochondrial % per cell - for filtered data + Genes vs RNA reads per cell. + Filtered. + PNG format. 
"sd:visualPlugins": - image: tab: "Filtered" - Caption: "RNA reads vs mitochondrial %" + Caption: "Genes vs RNA reads per cell (filtered)" fltr_umi_mito_plot_png: type: File? outputSource: sc_rna_filter/fltr_umi_mito_plot_png - label: "Genes vs RNA reads, filtered" + label: "RNA reads vs mitochondrial percentage per cell (filtered)" doc: | - Genes vs RNA reads per cell - for filtered data + RNA reads vs mitochondrial percentage + per cell. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "Genes vs RNA reads" + Caption: "RNA reads vs mitochondrial percentage per cell (filtered)" fltr_mito_dnst_plot_png: type: File? outputSource: sc_rna_filter/fltr_mito_dnst_plot_png - label: "Mitochondrial percentage, filtered" + label: "Distribution of RNA reads mapped to mitochondrial genes per cell (filtered)" doc: | - Percentage of RNA reads mapped to - mitochondrial genes per cell density - for filtered data + Distribution of RNA reads mapped + to mitochondrial genes per cell. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "Mitochondrial percentage" + Caption: "Distribution of RNA reads mapped to mitochondrial genes per cell (filtered)" fltr_nvlt_dnst_plot_png: type: File? outputSource: sc_rna_filter/fltr_nvlt_dnst_plot_png - label: "Novelty score, filtered" + label: "Distribution of novelty score per cell (filtered)" doc: | - Novelty score per cell density - for filtered data + Distribution of novelty score per cell. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "Novelty score" + Caption: "Distribution of novelty score per cell (filtered)" fltr_qc_mtrcs_dnst_plot_png: type: File? outputSource: sc_rna_filter/fltr_qc_mtrcs_dnst_plot_png - label: "Main QC metrics, filtered" + label: "Distribution of QC metrics per cell (filtered)" doc: | - Main QC metrics per cell densities - for filtered data + Distribution of QC metrics per cell. + Filtered. + PNG format. 
"sd:visualPlugins": - image: tab: "Filtered" - Caption: "Main QC metrics" + Caption: "Distribution of QC metrics per cell (filtered)" fltr_rnadbl_plot_png: type: File? outputSource: sc_rna_filter/fltr_rnadbl_plot_png - label: "RNA doublets, filtered" + label: "Percentage of RNA doublets (filtered)" doc: | - Percentage of RNA doublets per - dataset for filtered data + Percentage of RNA doublets. + Filtered. + PNG format. "sd:visualPlugins": - image: tab: "Filtered" - Caption: "RNA doublets" + Caption: "Percentage of RNA doublets (filtered)" fltr_umi_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/fltr_umi_dnst_spl_cnd_plot_png - label: "RNA reads per cell, filtered, split by condition" + label: "Distribution of RNA reads per cell (filtered, split by grouping condition)" doc: | - Split by grouping condition RNA reads - per cell density for filtered data + Distribution of RNA reads per cell. + Filtered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Filtered, by condition" - Caption: "RNA reads per cell" + tab: "Filtered, split by group" + Caption: "Distribution of RNA reads per cell (filtered, split by grouping condition)" fltr_gene_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/fltr_gene_dnst_spl_cnd_plot_png - label: "Genes per cell, filtered, split by condition" + label: "Distribution of genes per cell (filtered, split by grouping condition)" doc: | - Split by grouping condition genes - per cell for filtered data + Distribution of genes per cell. + Filtered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Filtered, by condition" - Caption: "Genes per cell" + tab: "Filtered, split by group" + Caption: "Distribution of genes per cell (filtered, split by grouping condition)" fltr_mito_dnst_spl_cnd_plot_png: type: File? 
outputSource: sc_rna_filter/fltr_mito_dnst_spl_cnd_plot_png - label: "Mitochondrial percentage, filtered, split by condition" + label: "Distribution of RNA reads mapped to mitochondrial genes per cell (filtered, split by grouping condition)" doc: | - Split by grouping condition the - percentage of RNA reads mapped to - mitochondrial genes per cell density - for filtered data + Distribution of RNA reads mapped + to mitochondrial genes per cell. + Filtered; split by grouping condition. + PNG format. "sd:visualPlugins": - image: - tab: "Filtered, by condition" - Caption: "Mitochondrial percentage" + tab: "Filtered, split by group" + Caption: "Distribution of RNA reads mapped to mitochondrial genes per cell (filtered, split by grouping condition)" fltr_nvlt_dnst_spl_cnd_plot_png: type: File? outputSource: sc_rna_filter/fltr_nvlt_dnst_spl_cnd_plot_png - label: "Novelty score, filtered, split by condition" + label: "Distribution of novelty score per cell (filtered, split by grouping condition)" doc: | - Split by grouping condition the - novelty score per cell density - for filtered data + Distribution of novelty score per cell. + Filtered; split by grouping condition. + PNG format. 
"sd:visualPlugins": - image: - tab: "Filtered, by condition" - Caption: "Novelty score" + tab: "Filtered, split by group" + Caption: "Distribution of novelty score per cell (filtered, split by grouping condition)" ucsc_cb_html_data: type: Directory @@ -788,36 +816,7 @@ steps: - fltr_gene_dnst_spl_cnd_plot_png - fltr_mito_dnst_spl_cnd_plot_png - fltr_nvlt_dnst_spl_cnd_plot_png - - raw_1_2_qc_mtrcs_pca_plot_pdf - - raw_2_3_qc_mtrcs_pca_plot_pdf - - raw_cells_count_plot_pdf - - raw_umi_dnst_plot_pdf - - raw_gene_dnst_plot_pdf - - raw_gene_umi_plot_pdf - - raw_umi_mito_plot_pdf - - raw_mito_dnst_plot_pdf - - raw_nvlt_dnst_plot_pdf - - raw_qc_mtrcs_dnst_plot_pdf - - raw_rnadbl_plot_pdf - - raw_umi_dnst_spl_cnd_plot_pdf - - raw_gene_dnst_spl_cnd_plot_pdf - - raw_mito_dnst_spl_cnd_plot_pdf - - raw_nvlt_dnst_spl_cnd_plot_pdf - - fltr_1_2_qc_mtrcs_pca_plot_pdf - - fltr_2_3_qc_mtrcs_pca_plot_pdf - - fltr_cells_count_plot_pdf - - fltr_umi_dnst_plot_pdf - - fltr_gene_dnst_plot_pdf - - fltr_gene_umi_plot_pdf - - fltr_umi_mito_plot_pdf - - fltr_mito_dnst_plot_pdf - - fltr_nvlt_dnst_plot_pdf - - fltr_qc_mtrcs_dnst_plot_pdf - - fltr_rnadbl_plot_pdf - - fltr_umi_dnst_spl_cnd_plot_pdf - - fltr_gene_dnst_spl_cnd_plot_pdf - - fltr_mito_dnst_spl_cnd_plot_pdf - - fltr_nvlt_dnst_spl_cnd_plot_pdf + - all_plots_pdf - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds @@ -830,36 +829,7 @@ steps: in: input_files: source: - - sc_rna_filter/raw_1_2_qc_mtrcs_pca_plot_pdf - - sc_rna_filter/raw_2_3_qc_mtrcs_pca_plot_pdf - - sc_rna_filter/raw_cells_count_plot_pdf - - sc_rna_filter/raw_umi_dnst_plot_pdf - - sc_rna_filter/raw_gene_dnst_plot_pdf - - sc_rna_filter/raw_gene_umi_plot_pdf - - sc_rna_filter/raw_umi_mito_plot_pdf - - sc_rna_filter/raw_mito_dnst_plot_pdf - - sc_rna_filter/raw_nvlt_dnst_plot_pdf - - sc_rna_filter/raw_qc_mtrcs_dnst_plot_pdf - - sc_rna_filter/raw_rnadbl_plot_pdf - - sc_rna_filter/raw_umi_dnst_spl_cnd_plot_pdf - - sc_rna_filter/raw_gene_dnst_spl_cnd_plot_pdf - - 
sc_rna_filter/raw_mito_dnst_spl_cnd_plot_pdf - - sc_rna_filter/raw_nvlt_dnst_spl_cnd_plot_pdf - - sc_rna_filter/fltr_1_2_qc_mtrcs_pca_plot_pdf - - sc_rna_filter/fltr_2_3_qc_mtrcs_pca_plot_pdf - - sc_rna_filter/fltr_cells_count_plot_pdf - - sc_rna_filter/fltr_umi_dnst_plot_pdf - - sc_rna_filter/fltr_gene_dnst_plot_pdf - - sc_rna_filter/fltr_gene_umi_plot_pdf - - sc_rna_filter/fltr_umi_mito_plot_pdf - - sc_rna_filter/fltr_mito_dnst_plot_pdf - - sc_rna_filter/fltr_nvlt_dnst_plot_pdf - - sc_rna_filter/fltr_qc_mtrcs_dnst_plot_pdf - - sc_rna_filter/fltr_rnadbl_plot_pdf - - sc_rna_filter/fltr_umi_dnst_spl_cnd_plot_pdf - - sc_rna_filter/fltr_gene_dnst_spl_cnd_plot_pdf - - sc_rna_filter/fltr_mito_dnst_spl_cnd_plot_pdf - - sc_rna_filter/fltr_nvlt_dnst_spl_cnd_plot_pdf + - sc_rna_filter/all_plots_pdf valueFrom: $(self.flat().filter(n => n)) folder_basename: default: "pdf_plots" @@ -882,7 +852,7 @@ $schemas: label: "Single-Cell RNA-Seq Filtering Analysis" s:name: "Single-Cell RNA-Seq Filtering Analysis" -s:alternateName: "Filters single-cell RNA-Seq datasets based on the common QC metrics" +s:alternateName: "Single-Cell RNA-Seq Filtering Analysis" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-filter.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -922,7 +892,8 @@ s:creator: doc: | Single-Cell RNA-Seq Filtering Analysis - Removes low-quality cells from the outputs of “Cell Ranger Count (RNA)”, - “Cell Ranger Count (RNA+VDJ)”, and “Cell Ranger Aggregate (RNA, RNA+VDJ)” - pipelines. The results of this workflow are primarily used in “Single-Cell + Removes low-quality cells from the outputs of the “Cell + Ranger Count (RNA)”, “Cell Ranger Count (RNA+VDJ)”, and + “Cell Ranger Aggregate (RNA, RNA+VDJ)” pipelines. The + results of this workflow are used in the “Single-Cell RNA-Seq Dimensionality Reduction Analysis” pipeline. 
\ No newline at end of file From d554e1b3a265f59c1e05dc9941aec3f2af842716 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 1 Apr 2024 16:41:02 -0400 Subject: [PATCH 129/162] Update sc pipelines to export Loupe file --- tools/sc-atac-cluster.cwl | 2 +- tools/sc-atac-coverage.cwl | 2 +- tools/sc-atac-dbinding.cwl | 2 +- tools/sc-atac-filter.cwl | 2 +- tools/sc-atac-reduce.cwl | 2 +- tools/sc-ctype-assign.cwl | 21 ++++++++++++++++++++- tools/sc-multiome-filter.cwl | 21 ++++++++++++++++++++- tools/sc-rna-cluster.cwl | 20 +++++++++++++++++++- tools/sc-rna-da-cells.cwl | 20 +++++++++++++++++++- tools/sc-rna-de-pseudobulk.cwl | 2 +- tools/sc-rna-filter.cwl | 20 +++++++++++++++++++- tools/sc-rna-reduce.cwl | 20 +++++++++++++++++++- tools/sc-rna-trajectory.cwl | 20 +++++++++++++++++++- tools/sc-triangulate.cwl | 21 ++++++++++++++++++++- tools/sc-vdj-profile.cwl | 20 +++++++++++++++++++- tools/sc-wnn-cluster.cwl | 21 ++++++++++++++++++++- workflows/sc-ctype-assign.cwl | 12 ++++++++++++ workflows/sc-multiome-filter.cwl | 12 ++++++++++++ workflows/sc-rna-cluster.cwl | 11 +++++++++++ workflows/sc-rna-da-cells.cwl | 11 +++++++++++ workflows/sc-rna-filter.cwl | 11 +++++++++++ workflows/sc-rna-reduce.cwl | 11 +++++++++++ workflows/sc-rna-trajectory.cwl | 11 +++++++++++ workflows/sc-triangulate.cwl | 12 ++++++++++++ workflows/sc-vdj-profile.cwl | 11 +++++++++++ workflows/sc-wnn-cluster.cwl | 12 ++++++++++++ 26 files changed, 314 insertions(+), 16 deletions(-) diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index 931a7394..feabc77b 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.35 + dockerPull: biowardrobe2/sc-tools:v0.0.36 inputs: diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index e7a2e2b2..ede44980 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: 
hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.35 + dockerPull: biowardrobe2/sc-tools:v0.0.36 inputs: diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index 8d97e2f5..67cef952 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.35 + dockerPull: biowardrobe2/sc-tools:v0.0.36 inputs: diff --git a/tools/sc-atac-filter.cwl b/tools/sc-atac-filter.cwl index c2b024d8..6ab24b2b 100644 --- a/tools/sc-atac-filter.cwl +++ b/tools/sc-atac-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.35 + dockerPull: biowardrobe2/sc-tools:v0.0.36 inputs: diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index 9401da93..a5e9da99 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.35 + dockerPull: biowardrobe2/sc-tools:v0.0.36 inputs: diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index 321a49df..b66470ca 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.35 + dockerPull: biowardrobe2/sc-tools:v0.0.36 inputs: @@ -279,6 +279,14 @@ inputs: Save raw counts from the RNA and/or ATAC assay(s) to h5ad file(s). Default: false + export_loupe_data: + type: boolean? + inputBinding: + prefix: "--loupe" + doc: | + Save raw counts from the RNA assay to Loupe file. + Default: false + export_scope_data: type: boolean? inputBinding: @@ -823,6 +831,15 @@ outputs: ATAC counts. H5AD format. + seurat_rna_data_cloupe: + type: File? + outputBinding: + glob: "*_rna_counts.cloupe" + doc: | + Seurat object. + RNA counts. + Loupe format + seurat_data_scope: type: File? 
outputBinding: @@ -1016,6 +1033,8 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA and/or ATAC assay(s) to h5ad file(s). Default: false + --loupe Save raw counts from the RNA assay to Loupe file. + Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --scope Save Seurat data to SCope compatible loom file. Only not normalized raw counts from the RNA assay will be diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index add80d42..7574bec4 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.35 + dockerPull: biowardrobe2/sc-tools:v0.0.36 inputs: @@ -406,6 +406,14 @@ inputs: Save raw counts from the RNA and ATAC assays to h5ad files. Default: false + export_loupe_data: + type: boolean? + inputBinding: + prefix: "--loupe" + doc: | + Save raw counts from the RNA assay to Loupe file. + Default: false + export_ucsc_cb: type: boolean? inputBinding: @@ -1303,6 +1311,15 @@ outputs: ATAC counts. H5AD format. + seurat_rna_data_cloupe: + type: File? + outputBinding: + glob: "*_rna_counts.cloupe" + doc: | + Seurat object. + RNA counts. + Loupe format + stdout_log: type: stdout @@ -1570,6 +1587,8 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA and ATAC assays to h5ad files. Default: false + --loupe Save raw counts from the RNA assay to Loupe file. + Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --tmpdir TMPDIR Directory to keep temporary files. 
Default: either /tmp or defined by environment variables TMPDIR, TMP, diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index bc4fae12..79154b83 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.35 + dockerPull: biowardrobe2/sc-tools:v0.0.36 inputs: @@ -202,6 +202,14 @@ inputs: Save raw counts from the RNA assay to h5ad file. Default: false + export_loupe_data: + type: boolean? + inputBinding: + prefix: "--loupe" + doc: | + Save raw counts from the RNA assay to Loupe file. + Default: false + export_scope_data: type: boolean? inputBinding: @@ -798,6 +806,14 @@ outputs: Seurat object. H5AD format + seurat_data_cloupe: + type: File? + outputBinding: + glob: "*_counts.cloupe" + doc: | + Seurat object. + Loupe format + seurat_data_scope: type: File? outputBinding: @@ -943,6 +959,8 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA assay to h5ad file. Default: false + --loupe Save raw counts from the RNA assay to Loupe file. + Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --scope Save Seurat data to SCope compatible loom file. Default: false diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index de4af447..44643254 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.35 + dockerPull: biowardrobe2/sc-tools:v0.0.36 inputs: @@ -166,6 +166,14 @@ inputs: Save raw counts from the RNA assay to h5ad file. Default: false + export_loupe_data: + type: boolean? + inputBinding: + prefix: "--loupe" + doc: | + Save raw counts from the RNA assay to Loupe file. + Default: false + export_ucsc_cb: type: boolean? inputBinding: @@ -471,6 +479,14 @@ outputs: Seurat object. H5AD format + seurat_data_cloupe: + type: File? 
+ outputBinding: + glob: "*_counts.cloupe" + doc: | + Seurat object. + Loupe format + stdout_log: type: stdout @@ -604,6 +620,8 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA assay to h5ad file. Default: false + --loupe Save raw counts from the RNA assay to Loupe file. + Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --output OUTPUT Output prefix. Default: ./sc --theme {gray,bw,linedraw,light,dark,minimal,classic,void} diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index 51a1ba31..3bcdec5d 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.35 + dockerPull: biowardrobe2/sc-tools:v0.0.36 inputs: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index 33d5506e..3045ccec 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -17,7 +17,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.35 + dockerPull: biowardrobe2/sc-tools:v0.0.36 inputs: @@ -225,6 +225,14 @@ inputs: Save raw counts from the RNA assay to h5ad file. Default: false + export_loupe_data: + type: boolean? + inputBinding: + prefix: "--loupe" + doc: | + Save raw counts from the RNA assay to Loupe file. + Default: false + export_ucsc_cb: type: boolean? inputBinding: @@ -615,6 +623,14 @@ outputs: Seurat object. H5AD format + seurat_data_cloupe: + type: File? + outputBinding: + glob: "*_counts.cloupe" + doc: | + Seurat object. + Loupe format + stdout_log: type: stdout @@ -804,6 +820,8 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA assay to h5ad file. Default: false + --loupe Save raw counts from the RNA assay to Loupe file. + Default: false --cbbuild Export results to UCSC Cell Browser. 
Default: false --output OUTPUT Output prefix. Default: ./sc --theme {gray,bw,linedraw,light,dark,minimal,classic,void} diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index f28d4226..9944b191 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.35 + dockerPull: biowardrobe2/sc-tools:v0.0.36 inputs: @@ -317,6 +317,14 @@ inputs: Save raw counts from the RNA assay to h5ad file. Default: false + export_loupe_data: + type: boolean? + inputBinding: + prefix: "--loupe" + doc: | + Save raw counts from the RNA assay to Loupe file. + Default: false + export_scope_data: type: boolean? inputBinding: @@ -725,6 +733,14 @@ outputs: Seurat object. H5AD format + seurat_data_cloupe: + type: File? + outputBinding: + glob: "*_counts.cloupe" + doc: | + Seurat object. + Loupe format + seurat_data_scope: type: File? outputBinding: @@ -924,6 +940,8 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA assay to h5ad file. Default: false + --loupe Save raw counts from the RNA assay to Loupe file. + Default: false --scope Save Seurat data to SCope compatible loom file. Default: false --cbbuild Export results to UCSC Cell Browser. Default: false diff --git a/tools/sc-rna-trajectory.cwl b/tools/sc-rna-trajectory.cwl index 1b10e470..989cbbaa 100644 --- a/tools/sc-rna-trajectory.cwl +++ b/tools/sc-rna-trajectory.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.35 + dockerPull: biowardrobe2/sc-tools:v0.0.36 inputs: @@ -140,6 +140,14 @@ inputs: Save raw counts from the RNA assay to h5ad file. Default: false + export_loupe_data: + type: boolean? + inputBinding: + prefix: "--loupe" + doc: | + Save raw counts from the RNA assay to Loupe file. + Default: false + export_ucsc_cb: type: boolean? inputBinding: @@ -607,6 +615,14 @@ outputs: Seurat object. 
H5AD format + seurat_data_cloupe: + type: File? + outputBinding: + glob: "*_counts.cloupe" + doc: | + Seurat object. + Loupe format + stdout_log: type: stdout @@ -723,6 +739,8 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA assay to h5ad file. Default: false + --loupe Save raw counts from the RNA assay to Loupe file. + Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --output OUTPUT Output prefix. Default: ./sc --theme {gray,bw,linedraw,light,dark,minimal,classic,void} diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index c7f1e0a4..744d147d 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.35 + dockerPull: biowardrobe2/sc-tools:v0.0.36 inputs: @@ -106,6 +106,14 @@ inputs: Save raw counts from the RNA and/or ATAC assay(s) to h5ad file(s). Default: false + export_loupe_data: + type: boolean? + inputBinding: + prefix: "--loupe" + doc: | + Save raw counts from the RNA assay to Loupe file. + Default: false + export_ucsc_cb: type: boolean? inputBinding: @@ -355,6 +363,15 @@ outputs: ATAC counts. H5AD format. + seurat_rna_data_cloupe: + type: File? + outputBinding: + glob: "*_rna_counts.cloupe" + doc: | + Seurat object. + RNA counts. + Loupe format + stdout_log: type: stdout @@ -457,6 +474,8 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA and/or ATAC assay(s) to h5ad file(s). Default: false + --loupe Save raw counts from the RNA assay to Loupe file. + Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --output OUTPUT Output prefix. 
Default: ./sc --theme {gray,bw,linedraw,light,dark,minimal,classic,void} diff --git a/tools/sc-vdj-profile.cwl b/tools/sc-vdj-profile.cwl index ab72d050..48d83653 100644 --- a/tools/sc-vdj-profile.cwl +++ b/tools/sc-vdj-profile.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.35 + dockerPull: biowardrobe2/sc-tools:v0.0.36 inputs: @@ -165,6 +165,14 @@ inputs: Save raw counts from the RNA assay to h5ad file. Default: false + export_loupe_data: + type: boolean? + inputBinding: + prefix: "--loupe" + doc: | + Save raw counts from the RNA assay to Loupe file. + Default: false + export_scope_data: type: boolean? inputBinding: @@ -670,6 +678,14 @@ outputs: Seurat object. H5AD format + seurat_data_cloupe: + type: File? + outputBinding: + glob: "*_counts.cloupe" + doc: | + Seurat object. + Loupe format + seurat_data_scope: type: File? outputBinding: @@ -808,6 +824,8 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA assay to h5ad file. Default: false + --loupe Save raw counts from the RNA assay to Loupe file. + Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --scope Save Seurat data to SCope compatible loom file. Default: false diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index aa41b137..3c116365 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.35 + dockerPull: biowardrobe2/sc-tools:v0.0.36 inputs: @@ -356,6 +356,14 @@ inputs: Save raw counts from the RNA and ATAC assays to h5ad files. Default: false + export_loupe_data: + type: boolean? + inputBinding: + prefix: "--loupe" + doc: | + Save raw counts from the RNA assay to Loupe file. + Default: false + export_scope_data: type: boolean? inputBinding: @@ -974,6 +982,15 @@ outputs: ATAC counts. H5AD format. 
+ seurat_rna_data_cloupe: + type: File? + outputBinding: + glob: "*_rna_counts.cloupe" + doc: | + Seurat object. + RNA counts. + Loupe format + seurat_data_scope: type: File? outputBinding: @@ -1191,6 +1208,8 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA and ATAC assays to h5ad files. Default: false + --loupe Save raw counts from the RNA assay to Loupe file. + Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --scope Save Seurat data to SCope compatible loom file. Only not normalized raw counts from the RNA assay will be diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index cb56a41f..aa53d09a 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -522,6 +522,15 @@ outputs: SCope compatible. Loom format. + seurat_rna_data_cloupe: + type: File? + outputSource: ctype_assign/seurat_rna_data_cloupe + label: "Seurat object in Loupe format" + doc: | + Seurat object. + RNA counts. + Loupe format. + pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder @@ -607,6 +616,8 @@ steps: default: true export_scope_data: default: true + export_loupe_data: + default: true export_pdf_plots: default: true color_theme: color_theme @@ -660,6 +671,7 @@ steps: - ucsc_cb_html_file - seurat_data_rds - seurat_data_scope + - seurat_rna_data_cloupe - stdout_log - stderr_log diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 1b80f3e9..12e9c997 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -1228,6 +1228,15 @@ outputs: Example of datasets metadata file in TSV format + seurat_rna_data_cloupe: + type: File? + outputSource: sc_multiome_filter/seurat_rna_data_cloupe + label: "Seurat object in Loupe format" + doc: | + Seurat object. + RNA counts. + Loupe format. 
+ pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder @@ -1339,6 +1348,8 @@ steps: default: true export_ucsc_cb: default: true + export_loupe_data: + default: true export_pdf_plots: default: true color_theme: color_theme @@ -1408,6 +1419,7 @@ steps: - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds + - seurat_rna_data_cloupe - datasets_metadata - stdout_log - stderr_log diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index 19e1dcaf..bee1d8ea 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -469,6 +469,14 @@ outputs: SCope compatible. Loom format. + seurat_data_cloupe: + type: File? + outputSource: sc_rna_cluster/seurat_data_cloupe + label: "Seurat object in Loupe format" + doc: | + Seurat object. + Loupe format. + pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder @@ -523,6 +531,8 @@ steps: default: true export_scope_data: default: true + export_loupe_data: + default: true export_pdf_plots: default: true color_theme: color_theme @@ -574,6 +584,7 @@ steps: - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds + - seurat_data_cloupe - seurat_data_scope - stdout_log - stderr_log diff --git a/workflows/sc-rna-da-cells.cwl b/workflows/sc-rna-da-cells.cwl index eb2d8786..643b5d18 100644 --- a/workflows/sc-rna-da-cells.cwl +++ b/workflows/sc-rna-da-cells.cwl @@ -321,6 +321,14 @@ outputs: Seurat object. RDS format. + seurat_data_cloupe: + type: File? + outputSource: da_cells/seurat_data_cloupe + label: "Seurat object in Loupe format" + doc: | + Seurat object. + Loupe format. 
+ pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder @@ -364,6 +372,8 @@ steps: default: true export_ucsc_cb: default: true + export_loupe_data: + default: true export_pdf_plots: default: true color_theme: color_theme @@ -398,6 +408,7 @@ steps: - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds + - seurat_data_cloupe - stdout_log - stderr_log diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index 4aeef01f..4c5d0d44 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -712,6 +712,14 @@ outputs: Example of datasets metadata file in TSV format + seurat_data_cloupe: + type: File? + outputSource: sc_rna_filter/seurat_data_cloupe + label: "Seurat object in Loupe format" + doc: | + Seurat object. + Loupe format. + pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder @@ -775,6 +783,8 @@ steps: default: true export_ucsc_cb: default: true + export_loupe_data: + default: true export_pdf_plots: default: true color_theme: color_theme @@ -820,6 +830,7 @@ steps: - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds + - seurat_data_cloupe - datasets_metadata - stdout_log - stderr_log diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 7460ed65..3620e59a 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -530,6 +530,14 @@ outputs: Seurat object. RDS format. + seurat_data_cloupe: + type: File? + outputSource: sc_rna_reduce/seurat_data_cloupe + label: "Seurat object in Loupe format" + doc: | + Seurat object. + Loupe format. 
+ pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder @@ -610,6 +618,8 @@ steps: export_ucsc_cb: export_ucsc_cb low_memory: default: true + export_loupe_data: + default: true export_pdf_plots: default: true color_theme: color_theme @@ -658,6 +668,7 @@ steps: - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds + - seurat_data_cloupe - stdout_log - stderr_log diff --git a/workflows/sc-rna-trajectory.cwl b/workflows/sc-rna-trajectory.cwl index 78236386..d1788e28 100644 --- a/workflows/sc-rna-trajectory.cwl +++ b/workflows/sc-rna-trajectory.cwl @@ -443,6 +443,14 @@ outputs: Seurat object. RDS format. + seurat_data_cloupe: + type: File? + outputSource: rna_trajectory/seurat_data_cloupe + label: "Seurat object in Loupe format" + doc: | + Seurat object. + Loupe format. + pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder @@ -489,6 +497,8 @@ steps: verbose: default: true export_ucsc_cb: export_ucsc_cb + export_loupe_data: + default: true export_pdf_plots: default: true color_theme: color_theme @@ -547,6 +557,7 @@ steps: - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds + - seurat_data_cloupe - stdout_log - stderr_log diff --git a/workflows/sc-triangulate.cwl b/workflows/sc-triangulate.cwl index 59bbc8df..039f1452 100644 --- a/workflows/sc-triangulate.cwl +++ b/workflows/sc-triangulate.cwl @@ -260,6 +260,15 @@ outputs: Seurat object. RDS format. + seurat_rna_data_cloupe: + type: File? + outputSource: triangulate/seurat_rna_data_cloupe + label: "Seurat object in Loupe format" + doc: | + Seurat object. + RNA counts. + Loupe format. 
+ pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder @@ -296,6 +305,8 @@ steps: default: true export_ucsc_cb: default: true + export_loupe_data: + default: true export_pdf_plots: default: true color_theme: color_theme @@ -328,6 +339,7 @@ steps: - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds + - seurat_rna_data_cloupe - stdout_log - stderr_log diff --git a/workflows/sc-vdj-profile.cwl b/workflows/sc-vdj-profile.cwl index 31ad21cf..db2255c6 100644 --- a/workflows/sc-vdj-profile.cwl +++ b/workflows/sc-vdj-profile.cwl @@ -484,6 +484,14 @@ outputs: SCope compatible. Loom format. + seurat_data_cloupe: + type: File? + outputSource: vdj_profile/seurat_data_cloupe + label: "Seurat object in Loupe format" + doc: | + Seurat object. + Loupe format. + pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder @@ -523,6 +531,8 @@ steps: source: strictness valueFrom: $(self=="none"?null:self) color_theme: color_theme + export_loupe_data: + default: true export_pdf_plots: default: true verbose: @@ -584,6 +594,7 @@ steps: - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds + - seurat_data_cloupe - seurat_data_scope - stdout_log - stderr_log diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index 48a871f3..fbcef825 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -530,6 +530,15 @@ outputs: SCope compatible. Loom format. + seurat_rna_data_cloupe: + type: File? + outputSource: sc_wnn_cluster/seurat_rna_data_cloupe + label: "Seurat object in Loupe format" + doc: | + Seurat object. + RNA counts. + Loupe format. 
+ pdf_plots: type: File outputSource: compress_pdf_plots/compressed_folder @@ -592,6 +601,8 @@ steps: default: true export_scope_data: default: true + export_loupe_data: + default: true export_pdf_plots: default: true color_theme: color_theme @@ -644,6 +655,7 @@ steps: - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds + - seurat_rna_data_cloupe - seurat_data_scope - stdout_log - stderr_log From c0ccaa1c615319aed49ba4789f49e16f8a156dd0 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 2 Apr 2024 15:49:49 -0400 Subject: [PATCH 130/162] Do not export to Loupe by default --- tools/sc-ctype-assign.cwl | 8 ++++++-- tools/sc-multiome-filter.cwl | 8 ++++++-- tools/sc-rna-cluster.cwl | 8 ++++++-- tools/sc-rna-da-cells.cwl | 8 ++++++-- tools/sc-rna-filter.cwl | 8 ++++++-- tools/sc-rna-reduce.cwl | 8 ++++++-- tools/sc-rna-trajectory.cwl | 8 ++++++-- tools/sc-triangulate.cwl | 8 ++++++-- tools/sc-vdj-profile.cwl | 8 ++++++-- tools/sc-wnn-cluster.cwl | 8 ++++++-- workflows/sc-ctype-assign.cwl | 15 +++++++++++++-- workflows/sc-multiome-filter.cwl | 15 +++++++++++++-- workflows/sc-rna-cluster.cwl | 15 +++++++++++++-- workflows/sc-rna-da-cells.cwl | 15 +++++++++++++-- workflows/sc-rna-filter.cwl | 15 +++++++++++++-- workflows/sc-rna-reduce.cwl | 15 +++++++++++++-- workflows/sc-rna-trajectory.cwl | 15 +++++++++++++-- workflows/sc-triangulate.cwl | 15 +++++++++++++-- workflows/sc-vdj-profile.cwl | 15 +++++++++++++-- workflows/sc-wnn-cluster.cwl | 15 +++++++++++++-- 20 files changed, 190 insertions(+), 40 deletions(-) diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index b66470ca..dccbe33b 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -284,7 +284,9 @@ inputs: inputBinding: prefix: "--loupe" doc: | - Save raw counts from the RNA assay to Loupe file. + Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. 
Default: false export_scope_data: @@ -1033,7 +1035,9 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA and/or ATAC assay(s) to h5ad file(s). Default: false - --loupe Save raw counts from the RNA assay to Loupe file. + --loupe Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --scope Save Seurat data to SCope compatible loom file. Only diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index 7574bec4..b3a9ce62 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -411,7 +411,9 @@ inputs: inputBinding: prefix: "--loupe" doc: | - Save raw counts from the RNA assay to Loupe file. + Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false export_ucsc_cb: @@ -1587,7 +1589,9 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA and ATAC assays to h5ad files. Default: false - --loupe Save raw counts from the RNA assay to Loupe file. + --loupe Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --tmpdir TMPDIR Directory to keep temporary files. Default: either diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index 79154b83..1f48b3cc 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -207,7 +207,9 @@ inputs: inputBinding: prefix: "--loupe" doc: | - Save raw counts from the RNA assay to Loupe file. + Save raw counts from the RNA assay to Loupe file. 
By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false export_scope_data: @@ -959,7 +961,9 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA assay to h5ad file. Default: false - --loupe Save raw counts from the RNA assay to Loupe file. + --loupe Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --scope Save Seurat data to SCope compatible loom file. diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index 44643254..70e935ca 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -171,7 +171,9 @@ inputs: inputBinding: prefix: "--loupe" doc: | - Save raw counts from the RNA assay to Loupe file. + Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false export_ucsc_cb: @@ -620,7 +622,9 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA assay to h5ad file. Default: false - --loupe Save raw counts from the RNA assay to Loupe file. + --loupe Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --output OUTPUT Output prefix. Default: ./sc diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index 3045ccec..e27c39be 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -230,7 +230,9 @@ inputs: inputBinding: prefix: "--loupe" doc: | - Save raw counts from the RNA assay to Loupe file. + Save raw counts from the RNA assay to Loupe file. 
By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false export_ucsc_cb: @@ -820,7 +822,9 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA assay to h5ad file. Default: false - --loupe Save raw counts from the RNA assay to Loupe file. + --loupe Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --output OUTPUT Output prefix. Default: ./sc diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index 9944b191..9d3c7f7e 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -322,7 +322,9 @@ inputs: inputBinding: prefix: "--loupe" doc: | - Save raw counts from the RNA assay to Loupe file. + Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false export_scope_data: @@ -940,7 +942,9 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA assay to h5ad file. Default: false - --loupe Save raw counts from the RNA assay to Loupe file. + --loupe Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false --scope Save Seurat data to SCope compatible loom file. Default: false diff --git a/tools/sc-rna-trajectory.cwl b/tools/sc-rna-trajectory.cwl index 989cbbaa..fd018e1a 100644 --- a/tools/sc-rna-trajectory.cwl +++ b/tools/sc-rna-trajectory.cwl @@ -145,7 +145,9 @@ inputs: inputBinding: prefix: "--loupe" doc: | - Save raw counts from the RNA assay to Loupe file. + Save raw counts from the RNA assay to Loupe file. 
By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false export_ucsc_cb: @@ -739,7 +741,9 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA assay to h5ad file. Default: false - --loupe Save raw counts from the RNA assay to Loupe file. + --loupe Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --output OUTPUT Output prefix. Default: ./sc diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index 744d147d..edc162fb 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -111,7 +111,9 @@ inputs: inputBinding: prefix: "--loupe" doc: | - Save raw counts from the RNA assay to Loupe file. + Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false export_ucsc_cb: @@ -474,7 +476,9 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA and/or ATAC assay(s) to h5ad file(s). Default: false - --loupe Save raw counts from the RNA assay to Loupe file. + --loupe Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --output OUTPUT Output prefix. Default: ./sc diff --git a/tools/sc-vdj-profile.cwl b/tools/sc-vdj-profile.cwl index 48d83653..1296f8dc 100644 --- a/tools/sc-vdj-profile.cwl +++ b/tools/sc-vdj-profile.cwl @@ -170,7 +170,9 @@ inputs: inputBinding: prefix: "--loupe" doc: | - Save raw counts from the RNA assay to Loupe file. + Save raw counts from the RNA assay to Loupe file. 
By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false export_scope_data: @@ -824,7 +826,9 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA assay to h5ad file. Default: false - --loupe Save raw counts from the RNA assay to Loupe file. + --loupe Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --scope Save Seurat data to SCope compatible loom file. diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index 3c116365..b8e1cf6b 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -361,7 +361,9 @@ inputs: inputBinding: prefix: "--loupe" doc: | - Save raw counts from the RNA assay to Loupe file. + Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false export_scope_data: @@ -1208,7 +1210,9 @@ s:about: | --h5seurat Save Seurat data to h5seurat file. Default: false --h5ad Save raw counts from the RNA and ATAC assays to h5ad files. Default: false - --loupe Save raw counts from the RNA assay to Loupe file. + --loupe Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --scope Save Seurat data to SCope compatible loom file. Only diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index aa53d09a..2d7ae677 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -176,6 +176,18 @@ inputs: The file should have two columns named 'cluster' and 'celltype'. + export_loupe_data: + type: boolean? 
+ default: false + label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + doc: | + Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. + Default: false + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -616,8 +628,7 @@ steps: default: true export_scope_data: default: true - export_loupe_data: - default: true + export_loupe_data: export_loupe_data export_pdf_plots: default: true color_theme: color_theme diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 12e9c997..d103ff94 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -424,6 +424,18 @@ inputs: "sd:layout": advanced: true + export_loupe_data: + type: boolean? + default: false + label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + doc: | + Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. + Default: false + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -1348,8 +1360,7 @@ steps: default: true export_ucsc_cb: default: true - export_loupe_data: - default: true + export_loupe_data: export_loupe_data export_pdf_plots: default: true color_theme: color_theme diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index bee1d8ea..ac9333ce 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -105,6 +105,18 @@ inputs: of interest to visualize expression. Default: None + export_loupe_data: + type: boolean? + default: false + label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + doc: | + Save raw counts from the RNA assay to Loupe file. 
By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. + Default: false + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -531,8 +543,7 @@ steps: default: true export_scope_data: default: true - export_loupe_data: - default: true + export_loupe_data: export_loupe_data export_pdf_plots: default: true color_theme: color_theme diff --git a/workflows/sc-rna-da-cells.cwl b/workflows/sc-rna-da-cells.cwl index 643b5d18..a1ce99ad 100644 --- a/workflows/sc-rna-da-cells.cwl +++ b/workflows/sc-rna-da-cells.cwl @@ -104,6 +104,18 @@ inputs: present in the Seurat object metadata, they will be overwritten. Default: no extra metadata is added + export_loupe_data: + type: boolean? + default: false + label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + doc: | + Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. + Default: false + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -372,8 +384,7 @@ steps: default: true export_ucsc_cb: default: true - export_loupe_data: - default: true + export_loupe_data: export_loupe_data export_pdf_plots: default: true color_theme: color_theme diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index 4c5d0d44..baa04ec6 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -236,6 +236,18 @@ inputs: "sd:layout": advanced: true + export_loupe_data: + type: boolean? + default: false + label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + doc: | + Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. 
+ Default: false + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -783,8 +795,7 @@ steps: default: true export_ucsc_cb: default: true - export_loupe_data: - default: true + export_loupe_data: export_loupe_data export_pdf_plots: default: true color_theme: color_theme diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 3620e59a..42f45ede 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -257,6 +257,18 @@ inputs: "sd:layout": advanced: true + export_loupe_data: + type: boolean? + default: false + label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + doc: | + Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. + Default: false + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -618,8 +630,7 @@ steps: export_ucsc_cb: export_ucsc_cb low_memory: default: true - export_loupe_data: - default: true + export_loupe_data: export_loupe_data export_pdf_plots: default: true color_theme: color_theme diff --git a/workflows/sc-rna-trajectory.cwl b/workflows/sc-rna-trajectory.cwl index d1788e28..61d15c83 100644 --- a/workflows/sc-rna-trajectory.cwl +++ b/workflows/sc-rna-trajectory.cwl @@ -119,6 +119,18 @@ inputs: "sd:layout": advanced: true + export_loupe_data: + type: boolean? + default: false + label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + doc: | + Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. 
+ Default: false + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -497,8 +509,7 @@ steps: verbose: default: true export_ucsc_cb: export_ucsc_cb - export_loupe_data: - default: true + export_loupe_data: export_loupe_data export_pdf_plots: default: true color_theme: color_theme diff --git a/workflows/sc-triangulate.cwl b/workflows/sc-triangulate.cwl index 039f1452..636492d3 100644 --- a/workflows/sc-triangulate.cwl +++ b/workflows/sc-triangulate.cwl @@ -80,6 +80,18 @@ inputs: metadata ovewriting the existing ones if those are present. Default: all cells used, no extra metadata is added + export_loupe_data: + type: boolean? + default: false + label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + doc: | + Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. + Default: false + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -305,8 +317,7 @@ steps: default: true export_ucsc_cb: default: true - export_loupe_data: - default: true + export_loupe_data: export_loupe_data export_pdf_plots: default: true color_theme: color_theme diff --git a/workflows/sc-vdj-profile.cwl b/workflows/sc-vdj-profile.cwl index db2255c6..9ac1fe1e 100644 --- a/workflows/sc-vdj-profile.cwl +++ b/workflows/sc-vdj-profile.cwl @@ -139,6 +139,18 @@ inputs: RNA-Seq Datasets" and can be utilized in the current or future steps of analysis. + export_loupe_data: + type: boolean? + default: false + label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + doc: | + Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. 
+ Default: false + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -531,8 +543,7 @@ steps: source: strictness valueFrom: $(self=="none"?null:self) color_theme: color_theme - export_loupe_data: - default: true + export_loupe_data: export_loupe_data export_pdf_plots: default: true verbose: diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index fbcef825..599988ee 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -154,6 +154,18 @@ inputs: plots will be created as well. Default: None + export_loupe_data: + type: boolean? + default: false + label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + doc: | + Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. + Default: false + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -601,8 +613,7 @@ steps: default: true export_scope_data: default: true - export_loupe_data: - default: true + export_loupe_data: export_loupe_data export_pdf_plots: default: true color_theme: color_theme From bb7a64f24130b768c35b35d4f812fe45b114ab47 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 12 Apr 2024 16:56:16 -0400 Subject: [PATCH 131/162] Add SoupOrCell pipeline for Cell Ranger Count and Multi outputs --- workflows/cellranger-multi.cwl | 8 +- workflows/souporcell-rna.cwl | 343 +++++++++++++++++++++++++++++++++ 2 files changed, 347 insertions(+), 4 deletions(-) create mode 100644 workflows/souporcell-rna.cwl diff --git a/workflows/cellranger-multi.cwl b/workflows/cellranger-multi.cwl index 073403e6..35de8b4d 100644 --- a/workflows/cellranger-multi.cwl +++ b/workflows/cellranger-multi.cwl @@ -10,7 +10,7 @@ requirements: "sd:upstream": - gex_indices: + genome_indices: - "cellranger-mkref.cwl" vdj_indices: - "cellranger-mkvdjref.cwl" @@ -35,13 +35,13 @@ inputs: This sample can be obtained from "Cell Ranger 
Reference (RNA, ATAC, RNA+ATAC)" pipeline. - "sd:upstreamSource": "gex_indices/indices_folder" + "sd:upstreamSource": "genome_indices/indices_folder" "sd:localLabel": true memory_limit: type: int? default: 20 - "sd:upstreamSource": "gex_indices/memory_limit" + "sd:upstreamSource": "genome_indices/memory_limit" vdj_indices_folder: type: Directory @@ -123,7 +123,7 @@ inputs: - "4" - "5" - "6" - default: "4" + default: "6" label: "Cores/CPUs" doc: | Parallelization parameter to define the diff --git a/workflows/souporcell-rna.cwl b/workflows/souporcell-rna.cwl new file mode 100644 index 00000000..a8646d08 --- /dev/null +++ b/workflows/souporcell-rna.cwl @@ -0,0 +1,343 @@ +cwlVersion: v1.1 +class: Workflow + + +requirements: +- class: SubworkflowFeatureRequirement +- class: StepInputExpressionRequirement +- class: MultipleInputFeatureRequirement +- class: InlineJavascriptRequirement + expressionLib: + - var split_by_common_delim = function(line) { + function get_unique(value, index, self) { + return self.indexOf(value) === index && value != ""; + } + let splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + + +"sd:upstream": + sc_rnaseq_sample: + - "cellranger-multi.cwl" + - "single-cell-preprocess-cellranger.cwl" + + +inputs: + + alias: + type: string + label: "Analysis name" + sd:preview: + position: 1 + + possorted_genome_bam_bai: + type: File + secondaryFiles: + - .bai + label: "Cell Ranger Count RNA or RNA+VDJ Sample" + doc: | + Any Cell Ranger Count (RNA) or + Cell Ranger Count (RNA+VDJ) Sample + "sd:upstreamSource": "sc_rnaseq_sample/possorted_genome_bam_bai" + "sd:localLabel": true + + genome_fasta_file: + type: File + secondaryFiles: + - .fai + label: "Cell Ranger Count RNA or RNA+VDJ Sample" + "sd:upstreamSource": "sc_rnaseq_sample/genome_indices/genome_indices/fasta_output" + + filtered_feature_bc_matrix_folder: + type: File + label: "Cell Ranger Count RNA or RNA+VDJ 
Sample" + "sd:upstreamSource": "sc_rnaseq_sample/filtered_feature_bc_matrix_folder" + + clusters_count: + type: int + label: "Number of clusters to detect (number of donors merged into one single-cell experiment)" + doc: | + Number of clusters to detect (number of donors merged into one single-cell experiment) + + barcodes_data: + type: File? + label: "Selected cell barcodes (optional)" + doc: | + A TSV/CSV file to optionally prefilter + the single cell data by including only + the cells with the selected barcodes. + The provided file should have one cell + barcode per line and do not include any + header information. + + ploidy_count: + type: int? + default: 2 + label: "Ploidy, must be 1 or 2" + doc: | + Ploidy, must be 1 or 2 + "sd:layout": + advanced: true + + min_alt: + type: int? + default: 10 + label: "Min alt to use locus" + doc: | + Min alt to use locus + "sd:layout": + advanced: true + + min_ref: + type: int? + default: 10 + label: "Min ref to use locus" + doc: | + Min ref to use locus + "sd:layout": + advanced: true + + max_loci: + type: int? + default: 2048 + label: "Max loci per cell, affects speed" + doc: | + Max loci per cell, affects speed + "sd:layout": + advanced: true + + restarts_count: + type: int? + default: 100 + label: "Number of restarts in clustering, when there are > 12 clusters we recommend increasing this to avoid local minima" + doc: | + Number of restarts in clustering, when there are > 12 + clusters we recommend increasing this to avoid local + minima + "sd:layout": + advanced: true + + known_genotypes_sample_names: + type: string? + label: "Which samples in population VCF from known genotypes option represent the donors in your sample" + doc: | + Which samples in population VCF from known genotypes + option represent the donors in your sample + "sd:layout": + advanced: true + + skip_remap: + type: boolean? 
+ default: false + label: "Don't remap with minimap2 (not recommended unless Common variant loci VCF file was provided)" + doc: | + Don't remap with minimap2 (not recommended unless in + conjunction with --common_variants) + "sd:layout": + advanced: true + + ignore_data_errors: + type: boolean? + label: "Ignore data error assertions" + doc: | + Set to True to ignore data error assertions + "sd:layout": + advanced: true + + threads: + type: int? + default: 2 + label: "Threads number to use" + doc: | + Threads number + "sd:layout": + advanced: true + + common_variants_vcf_file: + type: File? + label: "Common variant loci or known variant loci VCF file" + doc: | + Common variant loci or known variant loci VCF file, + must be made vs the same reference fasta + "sd:layout": + advanced: true + + known_genotypes_vcf_file: + type: File? + label: "Known variants per clone in population VCF file" + doc: | + Known variants per clone in population VCF mode, must be .vcf + "sd:layout": + advanced: true + + +outputs: + + genotype_cluster_tsv_file: + type: File + outputSource: rna_souporcell/genotype_cluster_tsv_file + label: "Cellurar barcodes file clustered by genotype" + doc: | + Cellurar barcodes file clustered by genotype + "sd:visualPlugins": + - syncfusiongrid: + tab: "Genotypes" + Title: "Cells clustered by genotype" + + genotype_cluster_vcf_file: + type: File + outputSource: rna_souporcell/genotype_cluster_vcf_file + label: "VCF file with genotypes for each cluster for each variant call" + doc: | + VCF file with genotypes for each cluster for each variant call. + Refer to http://software.broadinstitute.org/software/igv/viewing_vcf_files + for track description when displaying in IGV. 
+ + ambient_rna_file: + type: File + outputSource: rna_souporcell/ambient_rna_file + label: "Ambient RNA evaluation text file" + doc: | + Ambient RNA evaluation text file + + rna_souporcell_stdout_log: + type: File + outputSource: rna_souporcell/stdout_log + label: stdout log generated by souporcell + doc: | + stdout log generated by souporcell + + rna_souporcell_stderr_log: + type: File + outputSource: rna_souporcell/stderr_log + label: stderr log generated by souporcell + doc: | + stderr log generated by souporcell + + +steps: + + get_barcodes_tsv_file: + run: + cwlVersion: v1.1 + class: CommandLineTool + hints: + - class: DockerRequirement + dockerPull: biowardrobe2/scidap:v0.0.3 + inputs: + script: + type: string? + default: | + #!/bin/bash + tar xzf $0 + mv filtered_feature_bc_matrix/barcodes.tsv.gz . + gunzip barcodes.tsv.gz + if [ -f "$1" ]; then + echo "Filter by user provided barcodes" + comm -12 --check-order <(sort barcodes.tsv) <(sort $1) > cell_barcodes.tsv + else + echo "Do not filter by user provided barcodes" + mv barcodes.tsv cell_barcodes.tsv + fi + inputBinding: + position: 5 + filtered_feature_bc_matrix_folder: + type: File + inputBinding: + position: 6 + barcodes_data: + type: File? 
+ inputBinding: + position: 7 + outputs: + barcodes_tsv_file: + type: File + outputBinding: + glob: "cell_barcodes.tsv" + baseCommand: ["bash", "-c"] + in: + filtered_feature_bc_matrix_folder: filtered_feature_bc_matrix_folder + barcodes_data: barcodes_data + out: + - barcodes_tsv_file + + rna_souporcell: + run: ../tools/souporcell.cwl + in: + possorted_genome_bam_bai: possorted_genome_bam_bai + barcodes_tsv_file: get_barcodes_tsv_file/barcodes_tsv_file + genome_fasta_file: genome_fasta_file + clusters_count: clusters_count + ploidy_count: ploidy_count + min_alt: min_alt + min_ref: min_ref + max_loci: max_loci + restarts_count: restarts_count + common_variants_vcf_file: common_variants_vcf_file + known_genotypes_vcf_file: known_genotypes_vcf_file + known_genotypes_sample_names: + source: known_genotypes_sample_names + valueFrom: $(split_by_common_delim(self)) + skip_remap: skip_remap + ignore_data_errors: ignore_data_errors + threads: threads + out: + - genotype_cluster_tsv_file + - genotype_cluster_vcf_file + - ambient_rna_file + - stdout_log + - stderr_log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Souporcell Cluster by Genotype for RNA" +s:name: "Souporcell Cluster by Genotype for RNA" +s:alternateName: "Souporcell Cluster by Genotype for RNA" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/souporcell-rna.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + 
s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Souporcell Cluster by Genotype for RNA + + Souporcell: robust clustering of single-cell data by + genotype without reference genotypes \ No newline at end of file From 666eb95a3b8e32434c5cb64cb83352f325351861 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 15 Apr 2024 12:02:26 -0400 Subject: [PATCH 132/162] Fix bug in souporcell RNA to work with VDJ outputs --- workflows/souporcell-rna.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/souporcell-rna.cwl b/workflows/souporcell-rna.cwl index a8646d08..b1b07ee9 100644 --- a/workflows/souporcell-rna.cwl +++ b/workflows/souporcell-rna.cwl @@ -231,7 +231,7 @@ steps: default: | #!/bin/bash tar xzf $0 - mv filtered_feature_bc_matrix/barcodes.tsv.gz . + mv */barcodes.tsv.gz . 
gunzip barcodes.tsv.gz if [ -f "$1" ]; then echo "Filter by user provided barcodes" From ba9a746dc2dd2f857dc8e68916281179af411390 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 23 Apr 2024 16:28:58 -0400 Subject: [PATCH 133/162] Update V(D)J pipeline, update docker image to the latest --- tools/sc-atac-cluster.cwl | 2 +- tools/sc-atac-coverage.cwl | 2 +- tools/sc-atac-dbinding.cwl | 2 +- tools/sc-atac-filter.cwl | 39 +-- tools/sc-atac-reduce.cwl | 2 +- tools/sc-ctype-assign.cwl | 2 +- tools/sc-multiome-filter.cwl | 45 +-- tools/sc-rna-cluster.cwl | 2 +- tools/sc-rna-da-cells.cwl | 2 +- tools/sc-rna-de-pseudobulk.cwl | 2 +- tools/sc-rna-filter.cwl | 45 +-- tools/sc-rna-reduce.cwl | 2 +- tools/sc-rna-trajectory.cwl | 2 +- tools/sc-triangulate.cwl | 2 +- tools/sc-vdj-profile.cwl | 554 ++++++++++++-------------------- tools/sc-wnn-cluster.cwl | 2 +- workflows/sc-vdj-profile.cwl | 565 +++++++++++++++++---------------- 17 files changed, 549 insertions(+), 723 deletions(-) diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index feabc77b..56550096 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.36 + dockerPull: biowardrobe2/sc-tools:v0.0.37 inputs: diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index ede44980..799743b4 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.36 + dockerPull: biowardrobe2/sc-tools:v0.0.37 inputs: diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index 67cef952..2de29ade 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.36 + dockerPull: biowardrobe2/sc-tools:v0.0.37 inputs: diff --git 
a/tools/sc-atac-filter.cwl b/tools/sc-atac-filter.cwl index 6ab24b2b..288dc0f4 100644 --- a/tools/sc-atac-filter.cwl +++ b/tools/sc-atac-filter.cwl @@ -4,12 +4,6 @@ class: CommandLineTool requirements: - class: InlineJavascriptRequirement -- class: InitialWorkDirRequirement - listing: - - entryname: dummy_metadata.csv - entry: | - library_id - Experiment - class: EnvVarRequirement envDef: R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) @@ -17,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.36 + dockerPull: biowardrobe2/sc-tools:v0.0.37 inputs: @@ -34,12 +28,12 @@ inputs: aggregation_metadata: type: File? + inputBinding: + prefix: "--identity" doc: | - Path to the metadata TSV/CSV file to set the datasets identities. If --mex points - to the Cell Ranger Aggregate (ATAC) or Cell Ranger Aggregate (RNA+ATAC) outputs, - the aggr.csv file can be used. If Cell Ranger Count (ATAC) or Cell Ranger Count - (RNA+ATAC) outputs have been used in the --mex input, the file should include at - least one column - library_id and one row with the alias for that experiment. + Path to the metadata TSV/CSV file to set the datasets identities, if --mex points + to the Cell Ranger Aggregate (ATAC) or Cell Ranger Aggregate (RNA+ATAC) outputs. + The aggr.csv file can be used. 
atac_fragments_file: type: File @@ -825,17 +819,6 @@ outputs: baseCommand: ["sc_atac_filter.R"] -arguments: -- valueFrom: | - ${ - if (inputs.aggregation_metadata) { - return inputs.aggregation_metadata; - } else { - return runtime.outdir + "/dummy_metadata.csv" - } - } - prefix: "--identity" - stdout: sc_atac_filter_stdout.log stderr: sc_atac_filter_stderr.log @@ -898,7 +881,7 @@ doc: | s:about: | - usage: sc_atac_filter.R [-h] --mex MEX --identity IDENTITY + usage: sc_atac_filter.R [-h] --mex MEX [--identity IDENTITY] --fragments FRAGMENTS --annotations ANNOTATIONS --seqinfo SEQINFO [--grouping GROUPING] @@ -931,13 +914,9 @@ s:about: | For RNA+ATAC experiments the rows consisting genes will be ignored. --identity IDENTITY Path to the metadata TSV/CSV file to set the datasets - identities. If --mex points to the Cell Ranger + identities, if --mex points to the Cell Ranger Aggregate (ATAC) or Cell Ranger Aggregate (RNA+ATAC) - outputs, the aggr.csv file can be used. If Cell Ranger - Count (ATAC) or Cell Ranger Count (RNA+ATAC) outputs - have been used in the --mex input, the file should - include at least one column - library_id and one row - with the alias for that experiment. + outputs. The aggr.csv file can be used. --fragments FRAGMENTS Count and barcode information for every ATAC fragment observed in the experiment in TSV format. 
Tbi-index diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index a5e9da99..7018d905 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.36 + dockerPull: biowardrobe2/sc-tools:v0.0.37 inputs: diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index dccbe33b..b2a7281c 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.36 + dockerPull: biowardrobe2/sc-tools:v0.0.37 inputs: diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index b3a9ce62..fe7c4732 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -4,12 +4,6 @@ class: CommandLineTool requirements: - class: InlineJavascriptRequirement -- class: InitialWorkDirRequirement - listing: - - entryname: dummy_metadata.csv - entry: | - library_id - Experiment - class: EnvVarRequirement envDef: R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) @@ -17,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.36 + dockerPull: biowardrobe2/sc-tools:v0.0.37 inputs: @@ -33,10 +27,11 @@ inputs: aggregation_metadata: type: File? + inputBinding: + prefix: "--identity" doc: | - Path to the metadata TSV/CSV file to set the datasets identities. If '--mex' points to - the Cell Ranger ARC Aggregate outputs, the aggr.csv file can be used. If input is not - provided, the default dummy_metadata.csv will be used instead. + Path to the metadata TSV/CSV file to set the datasets identities, if '--mex' points to + the Cell Ranger ARC Aggregate outputs. The aggr.csv file can be used. 
atac_fragments_file: type: File @@ -1330,17 +1325,6 @@ outputs: baseCommand: ["sc_multiome_filter.R"] -arguments: -- valueFrom: | - ${ - if (inputs.aggregation_metadata) { - return inputs.aggregation_metadata; - } else { - return runtime.outdir + "/dummy_metadata.csv" - } - } - prefix: "--identity" - stdout: sc_multiome_filter_stdout.log stderr: sc_multiome_filter_stderr.log @@ -1403,9 +1387,10 @@ doc: | s:about: | - usage: sc_multiome_filter.R [-h] --mex MEX --identity IDENTITY - --fragments FRAGMENTS --annotations - ANNOTATIONS --seqinfo SEQINFO + usage: sc_multiome_filter.R [-h] --mex MEX + [--identity IDENTITY] --fragments + FRAGMENTS --annotations ANNOTATIONS + --seqinfo SEQINFO [--grouping GROUPING] [--blacklist BLACKLIST] [--barcodes BARCODES] @@ -1429,8 +1414,8 @@ s:about: | [--atacdbr ATACDBR] [--atacdbrsd ATACDBRSD] [--pdf] [--verbose] [--h5seurat] [--h5ad] - [--cbbuild] [--tmpdir TMPDIR] - [--output OUTPUT] + [--loupe] [--cbbuild] + [--tmpdir TMPDIR] [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] [--seed SEED] @@ -1445,12 +1430,8 @@ s:about: | concatenated together and the columns are restricted to those barcodes that are identified as cells. --identity IDENTITY Path to the metadata TSV/CSV file to set the datasets - identities. If '--mex' points to the Cell Ranger ARC - Aggregate outputs, the aggr.csv file can be used. If - Cell Ranger ARC Count outputs have been used in the ' - --mex' input, the file should include at least one - column - 'library_id' and one row with the alias for - Cell Ranger ARC Count experiment. + identities, if '--mex' points to the Cell Ranger ARC + Aggregate outputs. The aggr.csv file can be used. --fragments FRAGMENTS Count and barcode information for every ATAC fragment observed in the experiment in TSV format. 
Tbi-index diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index 1f48b3cc..f0fb946f 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.36 + dockerPull: biowardrobe2/sc-tools:v0.0.37 inputs: diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index 70e935ca..33d130e5 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.36 + dockerPull: biowardrobe2/sc-tools:v0.0.37 inputs: diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index 3bcdec5d..0be3a3c4 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.36 + dockerPull: biowardrobe2/sc-tools:v0.0.37 inputs: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index e27c39be..f304eaf5 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -4,12 +4,6 @@ class: CommandLineTool requirements: - class: InlineJavascriptRequirement -- class: InitialWorkDirRequirement - listing: - - entryname: dummy_metadata.csv - entry: | - library_id - Experiment - class: EnvVarRequirement envDef: R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) @@ -17,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.36 + dockerPull: biowardrobe2/sc-tools:v0.0.37 inputs: @@ -32,10 +26,15 @@ inputs: aggregation_metadata: type: File? + inputBinding: + prefix: "--identity" doc: | - Path to the metadata TSV/CSV file to set the datasets identities. If '--mex' points to - the Cell Ranger Aggregate outputs, the aggregation.csv file can be used. If input is not - provided, the default dummy_metadata.csv will be used instead. 
+ Path to the metadata TSV/CSV file to set the datasets identities, if '--mex' points to + the Cell Ranger Aggregate outputs. The aggregation.csv file can be used. In case of + using feature-barcode matrices the file with identities should include at least one + column - 'library_id', and a row with aliases per each experiment from the '--mex' + input. The order of rows should correspond to the order of feature-barcode matrices + provided in the '--mex' parameter. grouping_data: type: File? @@ -641,17 +640,6 @@ outputs: baseCommand: ["sc_rna_filter.R"] -arguments: -- valueFrom: | - ${ - if (inputs.aggregation_metadata) { - return inputs.aggregation_metadata; - } else { - return runtime.outdir + "/dummy_metadata.csv" - } - } - prefix: "--identity" - stdout: sc_rna_filter_stdout.log stderr: sc_rna_filter_stderr.log @@ -714,8 +702,9 @@ doc: | s:about: | - usage: sc_rna_filter.R [-h] --mex MEX [MEX ...] --identity - IDENTITY [--grouping GROUPING] + usage: sc_rna_filter.R [-h] --mex MEX [MEX ...] + [--identity IDENTITY] + [--grouping GROUPING] [--barcodes BARCODES] [--rnamincells RNAMINCELLS] [--mingenes [MINGENES [MINGENES ...]]] @@ -726,7 +715,8 @@ s:about: | [--maxmt MAXMT] [--removedoublets] [--rnadbr RNADBR] [--rnadbrsd RNADBRSD] [--pdf] [--verbose] [--h5seurat] - [--h5ad] [--cbbuild] [--output OUTPUT] + [--h5ad] [--loupe] [--cbbuild] + [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] [--seed SEED] @@ -742,10 +732,9 @@ s:about: | Count experiments) and will be merged before the analysis. --identity IDENTITY Path to the metadata TSV/CSV file to set the datasets - identities. If '--mex' points to the Cell Ranger - Aggregate outputs, the aggregation.csv file can be - used. In case of using feature-barcode matrices from a - single or multiple Cell Ranger Count experiments the + identities, if '--mex' points to the Cell Ranger + Aggregate outputs. The aggregation.csv file can be + used. 
In case of using feature-barcode matrices the file with identities should include at least one column - 'library_id', and a row with aliases per each experiment from the '--mex' input. The order of rows diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index 9d3c7f7e..316358ad 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.36 + dockerPull: biowardrobe2/sc-tools:v0.0.37 inputs: diff --git a/tools/sc-rna-trajectory.cwl b/tools/sc-rna-trajectory.cwl index fd018e1a..0c8f3189 100644 --- a/tools/sc-rna-trajectory.cwl +++ b/tools/sc-rna-trajectory.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.36 + dockerPull: biowardrobe2/sc-tools:v0.0.37 inputs: diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index edc162fb..72f22d41 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.36 + dockerPull: biowardrobe2/sc-tools:v0.0.37 inputs: diff --git a/tools/sc-vdj-profile.cwl b/tools/sc-vdj-profile.cwl index 1296f8dc..d5b541c0 100644 --- a/tools/sc-vdj-profile.cwl +++ b/tools/sc-vdj-profile.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.36 + dockerPull: biowardrobe2/sc-tools:v0.0.37 inputs: @@ -23,8 +23,10 @@ inputs: doc: | Path to the RDS file to load Seurat object from. This file should include gene expression information stored - in the RNA assay, as well as 'pca' and 'rnaumap' - dimensionality reductions applied to that assay. + in the RNA assay, as well as pca and rnaumap dimensionality + reductions applied to that assay. If loaded Seurat object + includes multiple datasets, it should have a donor column + to define grouping for clonotype calling. 
contigs_data: type: File @@ -64,14 +66,6 @@ inputs: if those are present. Default: all cells used, no extra metadata is added - query_source_column: - type: string - inputBinding: - prefix: "--source" - doc: | - Column from the metadata of the loaded Seurat - object to select clusters from. - cloneby: type: - "null" @@ -89,29 +83,37 @@ inputs: the amino acid sequence. strict: based on the combination of the nucleotide and gene sequences. Default: gene - groupby: - type: string? + minimum_frequency: + type: int? inputBinding: - prefix: "--groupby" + prefix: "--minfrequency" doc: | - Column from the metadata of the loaded Seurat object - to group cells for clonotype frequency calculation. - Default: group by dataset + Minimum frequency (number of cells) per + clonotype to be reported. + Default: 3 - strictness: + filterby: type: - "null" - type: enum symbols: - - "removemulti" - - "filtermulti" + - "cells" + - "chains" + inputBinding: + prefix: "--filter" + doc: | + Stringency filters to be applied. 1) cells: remove + cells with more than 2 chains. 2) chains: remove + chains exceeding 2 (select the most expressed ones). + Default: do not apply any filters. + + remove_partial: + type: boolean? inputBinding: - prefix: "--strictness" + prefix: "--removepartial" doc: | - Apply stringency filters. Removemulti: remove any cell - with more than 2 immune receptor chains. Filtermulti: - isolate the top 2 expressed chains in cell with multiple - chains. Default: do not apply any filters + Remove cells with only one chain detected. + Default: keep all cells if at least one chain detected export_pdf_plots: type: boolean? @@ -237,403 +239,247 @@ inputs: outputs: - count_spl_idnt_plot_png: + cl_qnt_gr_idnt_spl_ch_plot_png: type: File? outputBinding: - glob: "*_count_spl_idnt.png" + glob: "*_cl_qnt_gr_idnt_spl_ch.png" doc: | - Unique clonotypes, - split by dataset - PNG format + Percentage of unique clonotypes per dataset. 
+ Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. - count_spl_idnt_plot_pdf: + cl_dnst_gr_idnt_spl_ch_plot_png: type: File? outputBinding: - glob: "*_count_spl_idnt.pdf" + glob: "*_cl_dnst_gr_idnt_spl_ch.png" doc: | - Unique clonotypes, - split by dataset - PDF format + Distribution of clonotype frequencies per dataset. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. - count_spl_clst_plot_png: + allu_gr_idnt_spl_ch_plot_png: type: File? outputBinding: - glob: "*_count_spl_clst.png" + glob: "*_allu_gr_idnt_spl_ch.png" doc: | - Unique clonotypes, - split by cluster - PNG format + Proportion of top shared clonotypes between datasets. + Split by chain; filtered by minimum clonotype + frequency per donor; top clonotypes selected from + each dataset. + PNG format. - count_spl_clst_plot_pdf: + hmst_gr_idnt_spl_ch_plot_png: type: File? outputBinding: - glob: "*_count_spl_clst.pdf" + glob: "*_hmst_gr_idnt_spl_ch.png" doc: | - Unique clonotypes, - split by cluster - PDF format + Proportion of clonotype frequencies per dataset. + Split by chain; not filtered by clonotype frequency. + PNG format. - hmst_spl_idnt_plot_png: + vrlp_gr_idnt_spl_ch_plot_png: type: File? outputBinding: - glob: "*_hmst_spl_idnt.png" + glob: "*_vrlp_gr_idnt_spl_ch.png" doc: | - Clonal space homeostasis, - split by dataset - PNG format + Overlap of clonotypes between datasets. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. - hmst_spl_idnt_plot_pdf: + dvrs_gr_idnt_spl_ch_plot_png: type: File? outputBinding: - glob: "*_hmst_spl_idnt.pdf" + glob: "*_dvrs_gr_idnt_spl_ch.png" doc: | - Clonal space homeostasis, - split by dataset - PDF format + Diversity of clonotypes per dataset. + Split by chain; not filtered by clonotype frequency. + PNG format. - hmst_spl_clst_plot_png: + gene_gr_idnt_spl_ch_plot_png: type: File? 
outputBinding: - glob: "*_hmst_spl_clst.png" + glob: "*_gene_gr_idnt_spl_ch.png" doc: | - Clonal space homeostasis, - split by cluster - PNG format + Distribution of gene usage per dataset. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. - hmst_spl_clst_plot_pdf: + umap_cl_freq_spl_ch_plot_png: type: File? outputBinding: - glob: "*_hmst_spl_clst.pdf" + glob: "*_umap_cl_freq_spl_ch.png" doc: | - Clonal space homeostasis, - split by cluster - PDF format + UMAP colored by clonotype frequency. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. - vrlp_spl_clst_plot_png: + cl_qnt_gr_dnr_spl_ch_plot_png: type: File? outputBinding: - glob: "*_vrlp_spl_clst.png" + glob: "*_cl_qnt_gr_dnr_spl_ch.png" doc: | - Clonotypes similarity, - split by cluster - PNG format + Percentage of unique clonotypes per donor. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. - vrlp_spl_clst_plot_pdf: + cl_dnst_gr_dnr_spl_ch_plot_png: type: File? outputBinding: - glob: "*_vrlp_spl_clst.pdf" + glob: "*_cl_dnst_gr_dnr_spl_ch.png" doc: | - Clonotypes similarity, - split by cluster - PDF format + Distribution of clonotype frequencies per donor. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. - vrlp_spl_idnt_plot_png: + allu_gr_dnr_spl_ch_plot_png: type: File? outputBinding: - glob: "*_vrlp_spl_idnt.png" + glob: "*_allu_gr_dnr_spl_ch.png" doc: | - Clonotypes similarity, - split by dataset - PNG format + Proportion of top shared clonotypes between donors. + Split by chain; filtered by minimum clonotype + frequency per donor; top clonotypes selected from + each donor. + PNG format. - vrlp_spl_idnt_plot_pdf: + hmst_gr_dnr_spl_ch_plot_png: type: File? outputBinding: - glob: "*_vrlp_spl_idnt.pdf" + glob: "*_hmst_gr_dnr_spl_ch.png" doc: | - Clonotypes similarity, - split by dataset - PDF format + Proportion of clonotype frequencies per donor. 
+ Split by chain; not filtered by clonotype frequency. + PNG format. - ntwr_gr_clst_plot_png: + vrlp_gr_dnr_spl_ch_plot_png: type: File? outputBinding: - glob: "*_ntwr_gr_clst.png" + glob: "*_vrlp_gr_dnr_spl_ch.png" doc: | - Clonotypes network, - colored by cluster - PNG format + Overlap of clonotypes between donors. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. - ntwr_gr_clst_plot_pdf: + dvrs_gr_dnr_spl_ch_plot_png: type: File? outputBinding: - glob: "*_ntwr_gr_clst.pdf" + glob: "*_dvrs_gr_dnr_spl_ch.png" doc: | - Clonotypes network, - colored by cluster - PDF format + Diversity of clonotypes per donor. + Split by chain; not filtered by clonotype frequency. + PNG format. - ntwr_gr_idnt_plot_png: + gene_gr_dnr_spl_ch_plot_png: type: File? outputBinding: - glob: "*_ntwr_gr_idnt.png" + glob: "*_gene_gr_dnr_spl_ch.png" doc: | - Clonotypes network, - colored by dataset - PNG format + Distribution of gene usage per donor. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. - ntwr_gr_idnt_plot_pdf: + cl_qnt_gr_cnd_spl_ch_plot_png: type: File? outputBinding: - glob: "*_ntwr_gr_idnt.pdf" + glob: "*_cl_qnt_gr_cnd_spl_ch.png" doc: | - Clonotypes network, - colored by dataset - PDF format + Percentage of unique clonotypes per + grouping condition. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. - dvrs_gr_clst_spl_idnt_plot_png: + cl_dnst_gr_cnd_spl_ch_plot_png: type: File? outputBinding: - glob: "*_dvrs_gr_clst_spl_idnt.png" + glob: "*_cl_dnst_gr_cnd_spl_ch.png" doc: | - Clonotypes diversity, - colored by cluster, - split by dataset - PNG format + Distribution of clonotype frequencies + per grouping condition. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. - dvrs_gr_clst_spl_idnt_plot_pdf: + allu_gr_cnd_spl_ch_plot_png: type: File? 
outputBinding: - glob: "*_dvrs_gr_clst_spl_idnt.pdf" + glob: "*_allu_gr_cnd_spl_ch.png" doc: | - Clonotypes diversity, - colored by cluster, - split by dataset - PDF format + Proportion of top shared clonotypes between + grouping conditions. + Split by chain; filtered by minimum clonotype + frequency per donor; top clonotypes selected from + each grouping condition. + PNG format. - dvrs_gr_idnt_spl_clst_plot_png: + hmst_gr_cnd_spl_ch_plot_png: type: File? outputBinding: - glob: "*_dvrs_gr_idnt_spl_clst.png" + glob: "*_hmst_gr_cnd_spl_ch.png" doc: | - Clonotypes diversity, - colored by dataset, - split by cluster - PNG format + Proportion of clonotype frequencies per + grouping condition. + Split by chain; not filtered by clonotype frequency. + PNG format. - dvrs_gr_idnt_spl_clst_plot_pdf: + vrlp_gr_cnd_spl_ch_plot_png: type: File? outputBinding: - glob: "*_dvrs_gr_idnt_spl_clst.pdf" - doc: | - Clonotypes diversity, - colored by dataset, - split by cluster - PDF format - - gene_spl_clst_vdjc_plot_png: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_gene_spl_clst_*.png" + glob: "*_vrlp_gr_cnd_spl_ch.png" doc: | - Relative usage of V, D, J, C - genes, split by cluster - PNG format + Overlap of clonotypes between grouping conditions. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. - gene_spl_clst_vdjc_plot_pdf: - type: - - "null" - - type: array - items: File + dvrs_gr_cnd_spl_ch_plot_png: + type: File? outputBinding: - glob: "*_gene_spl_clst_*.pdf" + glob: "*_dvrs_gr_cnd_spl_ch.png" doc: | - Relative usage of V, D, J, C - genes, split by cluster - PDF format + Diversity of clonotypes per grouping condition. + Split by chain; not filtered by clonotype frequency. + PNG format. - gene_spl_idnt_vdjc_plot_png: - type: - - "null" - - type: array - items: File + gene_gr_cnd_spl_ch_plot_png: + type: File? 
outputBinding: - glob: "*_gene_spl_idnt_*.png" + glob: "*_gene_gr_cnd_spl_ch.png" doc: | - Relative usage of V, D, J, C - genes, split by dataset - PNG format + Distribution of gene usage per grouping condition. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. - gene_spl_idnt_vdjc_plot_pdf: + all_plots_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_gene_spl_idnt_*.pdf" - doc: | - Relative usage of V, D, J, C - genes, split by dataset - PDF format - - chrd_gr_clst_plot_png: - type: File? - outputBinding: - glob: "*_chrd_gr_clst.png" - doc: | - Shared clonotype, - colored by cluster - PNG format - - chrd_gr_clst_plot_pdf: - type: File? - outputBinding: - glob: "*_chrd_gr_clst.pdf" - doc: | - Shared clonotype, - colored by cluster - PDF format - - chrd_gr_idnt_plot_png: - type: File? - outputBinding: - glob: "*_chrd_gr_idnt.png" - doc: | - Shared clonotype, - colored by dataset - PNG format - - chrd_gr_idnt_plot_pdf: - type: File? - outputBinding: - glob: "*_chrd_gr_idnt.pdf" - doc: | - Shared clonotype, - colored by dataset - PDF format - - chrd_gr_cnd_plot_png: - type: File? - outputBinding: - glob: "*_chrd_gr_cnd.png" - doc: | - Shared clonotype, - colored by grouping condition - PNG format - - chrd_gr_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_chrd_gr_cnd.pdf" - doc: | - Shared clonotype, - colored by grouping condition - PDF format - - count_spl_cnd_plot_png: - type: File? - outputBinding: - glob: "*_count_spl_cnd.png" - doc: | - Unique clonotypes, - split by grouping condition - PNG format - - count_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_count_spl_cnd.pdf" - doc: | - Unique clonotypes, - split by grouping condition - PDF format - - hmst_spl_cnd_plot_png: - type: File? - outputBinding: - glob: "*_hmst_spl_cnd.png" - doc: | - Clonal space homeostasis, - split by grouping condition - PNG format - - hmst_spl_cnd_plot_pdf: - type: File? 
- outputBinding: - glob: "*_hmst_spl_cnd.pdf" - doc: | - Clonal space homeostasis, - split by grouping condition - PDF format - - vrlp_spl_cnd_plot_png: - type: File? - outputBinding: - glob: "*_vrlp_spl_cnd.png" - doc: | - Clonotypes similarity, - split by grouping condition - PNG format - - vrlp_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_vrlp_spl_cnd.pdf" - doc: | - Clonotypes similarity, - split by grouping condition - PDF format - - ntwr_gr_cnd_plot_png: - type: File? - outputBinding: - glob: "*_ntwr_gr_cnd.png" - doc: | - Clonotypes network, - colored by grouping condition - PNG format - - ntwr_gr_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_ntwr_gr_cnd.pdf" - doc: | - Clonotypes network, - colored by grouping condition - PDF format - - dvrs_gr_clst_spl_cnd_plot_png: - type: File? - outputBinding: - glob: "*_dvrs_gr_clst_spl_cnd.png" - doc: | - Clonotypes diversity, - colored by cluster, - split by grouping condition - PNG format - - dvrs_gr_clst_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_dvrs_gr_clst_spl_cnd.pdf" - doc: | - Clonotypes diversity, - colored by cluster, - split by grouping condition - PDF format - - dvrs_gr_cnd_spl_clst_plot_png: - type: File? - outputBinding: - glob: "*_dvrs_gr_cnd_spl_clst.png" + glob: "*.pdf" doc: | - Clonotypes diversity, - colored by grouping condition, - split by cluster - PNG format + All generated plots. + PDF format. - dvrs_gr_cnd_spl_clst_plot_pdf: + clonotypes_data_tsv: type: File? outputBinding: - glob: "*_dvrs_gr_cnd_spl_clst.pdf" + glob: "*_clonotypes.tsv" doc: | - Clonotypes diversity, - colored by grouping condition, - split by cluster - PDF format + Clonotypes data. + Filtered by minimum clonotype + frequency per donor. + TSV format. ucsc_cb_config_data: type: Directory? @@ -662,7 +508,7 @@ outputs: glob: "*_data.rds" doc: | Seurat object. - RDS format + RDS format. seurat_data_h5seurat: type: File? 
@@ -670,7 +516,7 @@ outputs: glob: "*_data.h5seurat" doc: | Seurat object. - h5Seurat format + h5Seurat format. seurat_data_h5ad: type: File? @@ -678,7 +524,7 @@ outputs: glob: "*_counts.h5ad" doc: | Seurat object. - H5AD format + H5AD format. seurat_data_cloupe: type: File? @@ -686,7 +532,7 @@ outputs: glob: "*_counts.cloupe" doc: | Seurat object. - Loupe format + Loupe format. seurat_data_scope: type: File? @@ -695,7 +541,7 @@ outputs: doc: | Seurat object. SCope compatible. - Loom format + Loom format. stdout_log: type: stdout @@ -719,7 +565,7 @@ $schemas: label: "Single-Cell Immune Profiling Analysis" s:name: "Single-Cell Immune Profiling Analysis" -s:alternateName: "TCR/BCR clonotype dynamics analysis" +s:alternateName: "Single-Cell Immune Profiling Analysis" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-vdj-profile.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -759,19 +605,20 @@ s:creator: doc: | Single-Cell Immune Profiling Analysis - TCR/BCR clonotype dynamics analysis + Estimates clonotype diversity and dynamics from V(D)J + sequencing data assembled into contigs. s:about: | - usage: /usr/local/bin/sc_vdj_profile.R [-h] --query QUERY --contigs CONTIGS + sc_vdj_profile.R [-h] --query QUERY --contigs CONTIGS [--metadata METADATA] - [--barcodes BARCODES] --source SOURCE + [--barcodes BARCODES] [--cloneby {gene,nt,aa,strict}] - [--groupby GROUPBY] - [--strictness {removemulti,filtermulti}] - [--pdf] [--verbose] [--h5seurat] - [--h5ad] [--cbbuild] [--scope] - [--output OUTPUT] + [--minfrequency MINFREQUENCY] + [--filter {cells,chains}] + [--removepartial] [--pdf] [--verbose] + [--h5seurat] [--h5ad] [--loupe] + [--cbbuild] [--scope] [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] [--seed SEED] @@ -782,8 +629,11 @@ s:about: | -h, --help show this help message and exit --query QUERY Path to the RDS file to load Seurat object from. 
This file should include gene expression information stored - in the RNA assay, as well as 'pca' and 'rnaumap' - dimensionality reductions applied to that assay. + in the RNA assay, as well as pca and rnaumap + dimensionality reductions applied to that assay. If + loaded Seurat object includes multiple datasets, it + should have a donor column to define grouping for + clonotype calling. --contigs CONTIGS Path to the file with high-level annotations of each high-confidence contig from cell-associated barcodes from the Cell Ranger Multi or Cell Ranger Aggregate @@ -805,22 +655,22 @@ s:about: | Seurat object metadata ovewriting the existing ones if those are present. Default: all cells used, no extra metadata is added - --source SOURCE Column from the metadata of the loaded Seurat object - to select clusters from. --cloneby {gene,nt,aa,strict} Defines how to call the clonotype. gene: based on VDJC gene sequence. nt: based on the nucleotide sequence. aa: based on the amino acid sequence. strict: based on the combination of the nucleotide and gene sequences. Default: gene - --groupby GROUPBY Column from the metadata of the loaded Seurat object - to group cells for clonotype frequency calculation. - Default: group by dataset - --strictness {removemulti,filtermulti} - Apply stringency filters. Removemulti: remove any cell - with more than 2 immune receptor chains. Filtermulti: - isolate the top 2 expressed chains in cell with - multiple chains. Default: do not apply any filters. + --minfrequency MINFREQUENCY + Minimum frequency (number of cells) per clonotype to + be reported. Default: 3 + --filter {cells,chains} + Stringency filters to be applied. cells: remove cells + with more than 2 chains. chains: remove chains + exceeding 2 (select the most expressed ones). Default: + do not apply any filters. + --removepartial Remove cells with only one chain detected. Default: + keep all cells if at least one chain detected --pdf Export plots in PDF. 
Default: false --verbose Print debug information. Default: false --h5seurat Save Seurat data to h5seurat file. Default: false diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index b8e1cf6b..13576b10 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.36 + dockerPull: biowardrobe2/sc-tools:v0.0.37 inputs: diff --git a/workflows/sc-vdj-profile.cwl b/workflows/sc-vdj-profile.cwl index 9ac1fe1e..f149c852 100644 --- a/workflows/sc-vdj-profile.cwl +++ b/workflows/sc-vdj-profile.cwl @@ -11,6 +11,7 @@ requirements: "sd:upstream": sc_tools_sample: + - "sc-rna-reduce.cwl" - "sc-rna-cluster.cwl" - "sc-ctype-assign.cwl" sc_vdj_sample: @@ -28,14 +29,12 @@ inputs: query_data_rds: type: File - label: "Single-cell Analysis with Clustered RNA-Seq Datasets" + label: "Single-cell Analysis with PCA Transformed RNA-Seq Datasets" doc: | Analysis that includes single-cell - RNA-Seq datasets run through either - "Single-Cell Manual Cell Type - Assignment" or "Single-Cell RNA-Seq - Cluster Analysis" at any of the - processing stages. + RNA-Seq datasets run through "Single-Cell + RNA-Seq Dimensionality Reduction Analysis" + at any of the processing stages. "sd:upstreamSource": "sc_tools_sample/seurat_data_rds" "sd:localLabel": true @@ -53,17 +52,6 @@ inputs: "sd:upstreamSource": "sc_vdj_sample/filtered_contig_annotations_csv" "sd:localLabel": true - query_source_column: - type: string - label: "Cells grouping" - doc: | - Single cell metadata column to group - cells into clusters. Usually, in a form - of "[rna|atac|wsnn]_res.X", where X is - the clustering resolution. If cell types - are available, add "custom_" prefix to - the column name. - cloneby: type: - "null" @@ -84,23 +72,40 @@ inputs: the nucleotide and gene sequences. 
Default: gene - strictness: + filterby: type: - "null" - type: enum symbols: - - "removemulti" - - "filtermulti" + - "cells" + - "chains" - "none" default: "none" label: "Stringency filter" doc: | - Apply stringency filters. removemulti: - remove any cell with more than 2 immune - receptor chains. filtermulti: isolate - the top 2 expressed chains in cell with - multiple chains. none: do not apply any - filters. Default: none + Applies stringency filters. 1) cells: + removes cells with more than 2 chains. + 2) chains: removes chains exceeding 2 + (selects the most expressed ones). + Default: none. + + remove_partial: + type: boolean? + default: false + label: "Remove cells with only one chain detected" + doc: | + Remove cells with only one chain detected. + Default: keep all cells if at least one + chain detected + + minimum_frequency: + type: int? + default: 3 + label: "Minimum clonotype frequency" + doc: | + Minimum frequency (number of cells) per + clonotype to be reported. + Default: 3 datasets_metadata: type: File? @@ -115,13 +120,15 @@ inputs: "library_id" as the first column and any number of additional columns with unique names, representing the desired - grouping categories. -# To obtain a proper this is not available yet, because we didn't refactor sc-rna-filter pipeline -# template of this file, download -# "datasets_metadata.tsv" output from the -# "Files" tab of the selected "Single-cell -# Analysis with Filtered RNA-Seq Datasets" -# and add extra columns as needed. + grouping categories. To obtain a proper + template of this file, download + "datasets_metadata.tsv" output from the + "Files" tab of the filtering step that + was run before the selected "Single-cell + Analysis with PCA Transformed RNA-Seq + Datasets" (a.k.a "Single-cell Analysis + with Filtered RNA-Seq Datasets") and add + extra columns as needed. barcodes_data: type: File? 
@@ -183,7 +190,7 @@ inputs: - "3" - "4" - "5" - - "6" + - "6" default: "4" label: "Cores/CPUs" doc: | @@ -197,269 +204,328 @@ inputs: outputs: - count_spl_idnt_plot_png: + umap_cl_freq_spl_ch_plot_png: type: File? - outputSource: vdj_profile/count_spl_idnt_plot_png - label: "Unique clonotypes, split by dataset" + outputSource: vdj_profile/umap_cl_freq_spl_ch_plot_png + label: "UMAP colored by clonotype frequency (split by chain, filtered by minimum frequency)" doc: | - Unique clonotypes, - split by dataset + UMAP colored by clonotype frequency. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. "sd:visualPlugins": - image: - tab: "Per dataset" - Caption: "Unique clonotypes, split by dataset" + tab: "UMAP" + Caption: "UMAP colored by clonotype frequency (split by chain, filtered by minimum frequency)" - hmst_spl_idnt_plot_png: + hmst_gr_idnt_spl_ch_plot_png: type: File? - outputSource: vdj_profile/hmst_spl_idnt_plot_png - label: "Clonal space homeostasis, split by dataset" + outputSource: vdj_profile/hmst_gr_idnt_spl_ch_plot_png + label: "Proportion of clonotype frequencies per dataset (split by chain, not filtered by minimum frequency)" doc: | - Clonal space homeostasis, - split by dataset + Proportion of clonotype frequencies per dataset. + Split by chain; not filtered by clonotype frequency. + PNG format. "sd:visualPlugins": - image: - tab: "Per dataset" - Caption: "Clonal space homeostasis, split by dataset" + tab: "Grouped by dataset" + Caption: "Proportion of clonotype frequencies per dataset (split by chain, not filtered by minimum frequency)" - vrlp_spl_idnt_plot_png: + dvrs_gr_idnt_spl_ch_plot_png: type: File?
- outputSource: vdj_profile/vrlp_spl_idnt_plot_png - label: "Clonotypes similarity, split by dataset" + outputSource: vdj_profile/dvrs_gr_idnt_spl_ch_plot_png + label: "Diversity of clonotypes per dataset (split by chain, not filtered by minimum frequency)" doc: | - Clonotypes similarity, - split by dataset + Diversity of clonotypes per dataset. + Split by chain; not filtered by clonotype frequency. + PNG format. "sd:visualPlugins": - image: - tab: "Per dataset" - Caption: "Clonotypes similarity, split by dataset" + tab: "Grouped by dataset" + Caption: "Diversity of clonotypes per dataset (split by chain, not filtered by minimum frequency)" - ntwr_gr_idnt_plot_png: + vrlp_gr_idnt_spl_ch_plot_png: type: File? - outputSource: vdj_profile/ntwr_gr_idnt_plot_png - label: "Clonotypes network, colored by dataset" + outputSource: vdj_profile/vrlp_gr_idnt_spl_ch_plot_png + label: "Overlap of clonotypes between datasets (split by chain, filtered by minimum frequency)" doc: | - Clonotypes network, - colored by dataset + Overlap of clonotypes between datasets. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. "sd:visualPlugins": - image: - tab: "Per dataset" - Caption: "Clonotypes network, colored by dataset" + tab: "Grouped by dataset" + Caption: "Overlap of clonotypes between datasets (split by chain, filtered by minimum frequency)" - dvrs_gr_clst_spl_idnt_plot_png: + allu_gr_idnt_spl_ch_plot_png: type: File? - outputSource: vdj_profile/dvrs_gr_clst_spl_idnt_plot_png - label: "Clonotypes diversity, colored by cluster, split by dataset" - doc: | - Clonotypes diversity, - colored by cluster, - split by dataset + outputSource: vdj_profile/allu_gr_idnt_spl_ch_plot_png + label: "Proportion of top shared clonotypes between datasets (split by chain, filtered by minimum frequency)" + doc: | + Proportion of top shared clonotypes between datasets. 
+ Split by chain; filtered by minimum clonotype + frequency per donor; top clonotypes selected from + each dataset. + PNG format. "sd:visualPlugins": - image: - tab: "Per dataset" - Caption: "Clonotypes diversity, colored by cluster, split by dataset" + tab: "Grouped by dataset" + Caption: "Proportion of top shared clonotypes between datasets (split by chain, filtered by minimum frequency)" - chrd_gr_idnt_plot_png: + cl_qnt_gr_idnt_spl_ch_plot_png: type: File? - outputSource: vdj_profile/chrd_gr_idnt_plot_png - label: "Shared clonotype, colored by dataset" + outputSource: vdj_profile/cl_qnt_gr_idnt_spl_ch_plot_png + label: "Percentage of unique clonotypes per dataset (split by chain, filtered by minimum frequency)" doc: | - Shared clonotype, - colored by dataset + Percentage of unique clonotypes per dataset. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. "sd:visualPlugins": - image: - tab: "Per dataset" - Caption: "Shared clonotype, colored by dataset" + tab: "Grouped by dataset" + Caption: "Percentage of unique clonotypes per dataset (split by chain, filtered by minimum frequency)" - gene_spl_idnt_vdjc_plot_png: - type: - - "null" - - type: array - items: File - outputSource: vdj_profile/gene_spl_idnt_vdjc_plot_png - label: "Relative usage of V, D, J, C genes, split by dataset" + gene_gr_idnt_spl_ch_plot_png: + type: File? + outputSource: vdj_profile/gene_gr_idnt_spl_ch_plot_png + label: "Distribution of gene usage per dataset (split by chain, filtered by minimum frequency)" doc: | - Relative usage of V, D, J, C - genes, split by dataset + Distribution of gene usage per dataset. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. 
"sd:visualPlugins": - image: - tab: "Per dataset" - Caption: "Relative usage of V, D, J, C genes, split by dataset" + tab: "Grouped by dataset" + Caption: "Distribution of gene usage per dataset (split by chain, filtered by minimum frequency)" - count_spl_clst_plot_png: + cl_dnst_gr_idnt_spl_ch_plot_png: type: File? - outputSource: vdj_profile/count_spl_clst_plot_png - label: "Unique clonotypes, split by cluster" + outputSource: vdj_profile/cl_dnst_gr_idnt_spl_ch_plot_png + label: "Distribution of clonotype frequencies per dataset (split by chain, filtered by minimum frequency)" doc: | - Unique clonotypes, - split by cluster + Distribution of clonotype frequencies per dataset. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. "sd:visualPlugins": - image: - tab: "Per cluster" - Caption: "Unique clonotypes, split by cluster" + tab: "Grouped by dataset" + Caption: "Distribution of clonotype frequencies per dataset (split by chain, filtered by minimum frequency)" - hmst_spl_clst_plot_png: + hmst_gr_dnr_spl_ch_plot_png: type: File? - outputSource: vdj_profile/hmst_spl_clst_plot_png - label: "Clonal space homeostasis, split by cluster" + outputSource: vdj_profile/hmst_gr_dnr_spl_ch_plot_png + label: "Proportion of clonotype frequencies per donor (split by chain, not filtered by minimum frequency)" doc: | - Clonal space homeostasis, - split by cluster + Proportion of clonotype frequencies per donor. + Split by chain; not filtered by clonotype frequency. + PNG format. "sd:visualPlugins": - image: - tab: "Per cluster" - Caption: "Clonal space homeostasis, split by cluster" + tab: "Grouped by donor" + Caption: "Proportion of clonotype frequencies per donor (split by chain, not filtered by minimum frequency)" - vrlp_spl_clst_plot_png: + dvrs_gr_dnr_spl_ch_plot_png: type: File? 
- outputSource: vdj_profile/vrlp_spl_clst_plot_png - label: "Clonotypes similarity, split by cluster" + outputSource: vdj_profile/dvrs_gr_dnr_spl_ch_plot_png + label: "Diversity of clonotypes per donor (split by chain, not filtered by minimum frequency)" doc: | - Clonotypes similarity, - split by cluster + Diversity of clonotypes per donor. + Split by chain; not filtered by clonotype frequency. + PNG format. "sd:visualPlugins": - image: - tab: "Per cluster" - Caption: "Clonotypes similarity, split by cluster" + tab: "Grouped by donor" + Caption: "Diversity of clonotypes per donor (split by chain, not filtered by minimum frequency)" - ntwr_gr_clst_plot_png: + vrlp_gr_dnr_spl_ch_plot_png: type: File? - outputSource: vdj_profile/ntwr_gr_clst_plot_png - label: "Clonotypes network, colored by cluster" + outputSource: vdj_profile/vrlp_gr_dnr_spl_ch_plot_png + label: "Overlap of clonotypes between donors (split by chain, filtered by minimum frequency)" doc: | - Clonotypes network, - colored by cluster + Overlap of clonotypes between donors. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. "sd:visualPlugins": - image: - tab: "Per cluster" - Caption: "Clonotypes network, colored by cluster" + tab: "Grouped by donor" + Caption: "Overlap of clonotypes between donors (split by chain, filtered by minimum frequency)" - dvrs_gr_idnt_spl_clst_plot_png: + allu_gr_dnr_spl_ch_plot_png: type: File? - outputSource: vdj_profile/dvrs_gr_idnt_spl_clst_plot_png - label: "Clonotypes diversity, colored by dataset, split by cluster" - doc: | - Clonotypes diversity, - colored by dataset, - split by cluster + outputSource: vdj_profile/allu_gr_dnr_spl_ch_plot_png + label: "Proportion of top shared clonotypes between donors (split by chain, filtered by minimum frequency)" + doc: | + Proportion of top shared clonotypes between donors. + Split by chain; filtered by minimum clonotype + frequency per donor; top clonotypes selected from + each donor. + PNG format. 
"sd:visualPlugins": - image: - tab: "Per cluster" - Caption: "Clonotypes diversity, colored by dataset, split by cluster" + tab: "Grouped by donor" + Caption: "Proportion of top shared clonotypes between donors (split by chain, filtered by minimum frequency)" - chrd_gr_clst_plot_png: + cl_qnt_gr_dnr_spl_ch_plot_png: type: File? - outputSource: vdj_profile/chrd_gr_clst_plot_png - label: "Shared clonotype, colored by cluster" + outputSource: vdj_profile/cl_qnt_gr_dnr_spl_ch_plot_png + label: "Percentage of unique clonotypes per donor (split by chain, filtered by minimum frequency)" doc: | - Shared clonotype, - colored by cluster + Percentage of unique clonotypes per donor. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. "sd:visualPlugins": - image: - tab: "Per cluster" - Caption: "Shared clonotype, colored by cluster" + tab: "Grouped by donor" + Caption: "Percentage of unique clonotypes per donor (split by chain, filtered by minimum frequency)" - gene_spl_clst_vdjc_plot_png: - type: - - "null" - - type: array - items: File - outputSource: vdj_profile/gene_spl_clst_vdjc_plot_png - label: "Relative usage of V, D, J, C genes, split by cluster" + gene_gr_dnr_spl_ch_plot_png: + type: File? + outputSource: vdj_profile/gene_gr_dnr_spl_ch_plot_png + label: "Distribution of gene usage per donor (split by chain, filtered by minimum frequency)" doc: | - Relative usage of V, D, J, C - genes, split by cluster + Distribution of gene usage per donor. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. "sd:visualPlugins": - image: - tab: "Per cluster" - Caption: "Relative usage of V, D, J, C genes, split by cluster" + tab: "Grouped by donor" + Caption: "Distribution of gene usage per donor (split by chain, filtered by minimum frequency)" - count_spl_cnd_plot_png: + cl_dnst_gr_dnr_spl_ch_plot_png: type: File? 
- outputSource: vdj_profile/count_spl_cnd_plot_png - label: "Unique clonotypes, split by grouping condition" + outputSource: vdj_profile/cl_dnst_gr_dnr_spl_ch_plot_png + label: "Distribution of clonotype frequencies per donor (split by chain, filtered by minimum frequency)" doc: | - Unique clonotypes, - split by grouping - condition + Distribution of clonotype frequencies per donor. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. "sd:visualPlugins": - image: - tab: "Per group" - Caption: "Unique clonotypes, split by grouping condition" + tab: "Grouped by donor" + Caption: "Distribution of clonotype frequencies per donor (split by chain, filtered by minimum frequency)" - hmst_spl_cnd_plot_png: + hmst_gr_cnd_spl_ch_plot_png: type: File? - outputSource: vdj_profile/hmst_spl_cnd_plot_png - label: "Clonal space homeostasis, split by grouping condition" + outputSource: vdj_profile/hmst_gr_cnd_spl_ch_plot_png + label: "Proportion of clonotype frequencies per grouping condition (split by chain, not filtered by minimum frequency)" doc: | - Clonal space homeostasis, - split by grouping condition + Proportion of clonotype frequencies per + grouping condition. + Split by chain; not filtered by clonotype frequency. + PNG format. "sd:visualPlugins": - image: - tab: "Per group" - Caption: "Clonal space homeostasis, split by grouping condition" + tab: "Grouped by condition" + Caption: "Proportion of clonotype frequencies per grouping condition (split by chain, not filtered by minimum frequency)" - vrlp_spl_cnd_plot_png: + dvrs_gr_cnd_spl_ch_plot_png: type: File? - outputSource: vdj_profile/vrlp_spl_cnd_plot_png - label: "Clonotypes similarity, split by grouping condition" + outputSource: vdj_profile/dvrs_gr_cnd_spl_ch_plot_png + label: "Diversity of clonotypes per grouping condition (split by chain, not filtered by minimum frequency)" doc: | - Clonotypes similarity, - split by grouping condition + Diversity of clonotypes per grouping condition. 
+ Split by chain; not filtered by clonotype frequency. + PNG format. "sd:visualPlugins": - image: - tab: "Per group" - Caption: "Clonotypes similarity, split by grouping condition" + tab: "Grouped by condition" + Caption: "Diversity of clonotypes per grouping condition (split by chain, not filtered by minimum frequency)" - ntwr_gr_cnd_plot_png: + vrlp_gr_cnd_spl_ch_plot_png: type: File? - outputSource: vdj_profile/ntwr_gr_cnd_plot_png - label: "Clonotypes network, colored by grouping condition" + outputSource: vdj_profile/vrlp_gr_cnd_spl_ch_plot_png + label: "Overlap of clonotypes between grouping conditions (split by chain, filtered by minimum frequency)" doc: | - Clonotypes network, - colored by grouping condition + Overlap of clonotypes between grouping conditions. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. "sd:visualPlugins": - image: - tab: "Per group" - Caption: "Clonotypes network, colored by grouping condition" + tab: "Grouped by condition" + Caption: "Overlap of clonotypes between grouping conditions (split by chain, filtered by minimum frequency)" - dvrs_gr_clst_spl_cnd_plot_png: + allu_gr_cnd_spl_ch_plot_png: type: File? - outputSource: vdj_profile/dvrs_gr_clst_spl_cnd_plot_png - label: "Clonotypes diversity, colored by cluster, split by grouping condition" - doc: | - Clonotypes diversity, - colored by cluster, - split by grouping condition + outputSource: vdj_profile/allu_gr_cnd_spl_ch_plot_png + label: "Proportion of top shared clonotypes between grouping conditions (split by chain, filtered by minimum frequency)" + doc: | + Proportion of top shared clonotypes between + grouping conditions. + Split by chain; filtered by minimum clonotype + frequency per donor; top clonotypes selected from + each grouping condition. + PNG format. 
"sd:visualPlugins": - image: - tab: "Per group" - Caption: "Clonotypes diversity, colored by cluster, split by grouping condition" + tab: "Grouped by condition" + Caption: "Proportion of top shared clonotypes between grouping conditions (split by chain, filtered by minimum frequency)" - dvrs_gr_cnd_spl_clst_plot_png: + cl_qnt_gr_cnd_spl_ch_plot_png: type: File? - outputSource: vdj_profile/dvrs_gr_cnd_spl_clst_plot_png - label: "Clonotypes diversity, colored by grouping condition, split by cluster" - doc: | - Clonotypes diversity, - colored by grouping condition, - split by cluster + outputSource: vdj_profile/cl_qnt_gr_cnd_spl_ch_plot_png + label: "Percentage of unique clonotypes per grouping condition (split by chain, filtered by minimum frequency)" + doc: | + Percentage of unique clonotypes per + grouping condition. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. "sd:visualPlugins": - image: - tab: "Per group" - Caption: "Clonotypes diversity, colored by grouping condition, split by cluster" + tab: "Grouped by condition" + Caption: "Percentage of unique clonotypes per grouping condition (split by chain, filtered by minimum frequency)" - chrd_gr_cnd_plot_png: + gene_gr_cnd_spl_ch_plot_png: type: File? - outputSource: vdj_profile/chrd_gr_cnd_plot_png - label: "Shared clonotype, colored by grouping condition" + outputSource: vdj_profile/gene_gr_cnd_spl_ch_plot_png + label: "Distribution of gene usage per grouping condition (split by chain, filtered by minimum frequency)" doc: | - Shared clonotype, - colored by grouping - condition + Distribution of gene usage per grouping condition. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. + "sd:visualPlugins": + - image: + tab: "Grouped by condition" + Caption: "Distribution of gene usage per grouping condition (split by chain, filtered by minimum frequency)" + + cl_dnst_gr_cnd_spl_ch_plot_png: + type: File? 
+ outputSource: vdj_profile/cl_dnst_gr_cnd_spl_ch_plot_png + label: "Distribution of clonotype frequencies per grouping condition (split by chain, filtered by minimum frequency)" + doc: | + Distribution of clonotype frequencies + per grouping condition. + Split by chain; filtered by minimum clonotype + frequency per donor. + PNG format. "sd:visualPlugins": - image: - tab: "Per group" - Caption: "Shared clonotype, colored by grouping condition" + tab: "Grouped by condition" + Caption: "Distribution of clonotype frequencies per grouping condition (split by chain, filtered by minimum frequency)" + + clonotypes_data_tsv: + type: File? + outputSource: vdj_profile/clonotypes_data_tsv + label: "Clonotypes (filtered by minimum frequency)" + doc: | + Clonotypes data. + Filtered by minimum clonotype + frequency per donor. + TSV format. + "sd:visualPlugins": + - syncfusiongrid: + tab: "Clonotypes table" + Title: "Clonotypes (filtered by minimum frequency)" ucsc_cb_html_data: type: Directory? @@ -535,13 +601,12 @@ steps: contigs_data: contigs_data datasets_metadata: datasets_metadata barcodes_data: barcodes_data - query_source_column: query_source_column cloneby: cloneby - groupby: - default: "new.ident" - strictness: - source: strictness + minimum_frequency: minimum_frequency + filterby: + source: filterby valueFrom: $(self=="none"?null:self) + remove_partial: remove_partial color_theme: color_theme export_loupe_data: export_loupe_data export_pdf_plots: @@ -560,48 +625,30 @@ steps: source: threads valueFrom: $(parseInt(self)) out: - - count_spl_idnt_plot_png - - count_spl_idnt_plot_pdf - - count_spl_clst_plot_png - - count_spl_clst_plot_pdf - - hmst_spl_idnt_plot_png - - hmst_spl_idnt_plot_pdf - - hmst_spl_clst_plot_png - - hmst_spl_clst_plot_pdf - - vrlp_spl_clst_plot_png - - vrlp_spl_clst_plot_pdf - - vrlp_spl_idnt_plot_png - - vrlp_spl_idnt_plot_pdf - - ntwr_gr_clst_plot_png - - ntwr_gr_clst_plot_pdf - - ntwr_gr_idnt_plot_png - - ntwr_gr_idnt_plot_pdf - - 
dvrs_gr_clst_spl_idnt_plot_png - - dvrs_gr_clst_spl_idnt_plot_pdf - - dvrs_gr_idnt_spl_clst_plot_png - - dvrs_gr_idnt_spl_clst_plot_pdf - - gene_spl_clst_vdjc_plot_png - - gene_spl_clst_vdjc_plot_pdf - - gene_spl_idnt_vdjc_plot_png - - gene_spl_idnt_vdjc_plot_pdf - - chrd_gr_clst_plot_png - - chrd_gr_clst_plot_pdf - - chrd_gr_idnt_plot_png - - chrd_gr_idnt_plot_pdf - - chrd_gr_cnd_plot_png - - chrd_gr_cnd_plot_pdf - - count_spl_cnd_plot_png - - count_spl_cnd_plot_pdf - - hmst_spl_cnd_plot_png - - hmst_spl_cnd_plot_pdf - - vrlp_spl_cnd_plot_png - - vrlp_spl_cnd_plot_pdf - - ntwr_gr_cnd_plot_png - - ntwr_gr_cnd_plot_pdf - - dvrs_gr_clst_spl_cnd_plot_png - - dvrs_gr_clst_spl_cnd_plot_pdf - - dvrs_gr_cnd_spl_clst_plot_png - - dvrs_gr_cnd_spl_clst_plot_pdf + - cl_qnt_gr_idnt_spl_ch_plot_png + - cl_dnst_gr_idnt_spl_ch_plot_png + - allu_gr_idnt_spl_ch_plot_png + - hmst_gr_idnt_spl_ch_plot_png + - vrlp_gr_idnt_spl_ch_plot_png + - dvrs_gr_idnt_spl_ch_plot_png + - gene_gr_idnt_spl_ch_plot_png + - umap_cl_freq_spl_ch_plot_png + - cl_qnt_gr_dnr_spl_ch_plot_png + - cl_dnst_gr_dnr_spl_ch_plot_png + - allu_gr_dnr_spl_ch_plot_png + - hmst_gr_dnr_spl_ch_plot_png + - vrlp_gr_dnr_spl_ch_plot_png + - dvrs_gr_dnr_spl_ch_plot_png + - gene_gr_dnr_spl_ch_plot_png + - cl_qnt_gr_cnd_spl_ch_plot_png + - cl_dnst_gr_cnd_spl_ch_plot_png + - allu_gr_cnd_spl_ch_plot_png + - hmst_gr_cnd_spl_ch_plot_png + - vrlp_gr_cnd_spl_ch_plot_png + - dvrs_gr_cnd_spl_ch_plot_png + - gene_gr_cnd_spl_ch_plot_png + - all_plots_pdf + - clonotypes_data_tsv - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds @@ -615,27 +662,7 @@ steps: in: input_files: source: - - vdj_profile/count_spl_idnt_plot_pdf - - vdj_profile/count_spl_clst_plot_pdf - - vdj_profile/hmst_spl_idnt_plot_pdf - - vdj_profile/hmst_spl_clst_plot_pdf - - vdj_profile/vrlp_spl_clst_plot_pdf - - vdj_profile/vrlp_spl_idnt_plot_pdf - - vdj_profile/ntwr_gr_clst_plot_pdf - - vdj_profile/ntwr_gr_idnt_plot_pdf - - 
vdj_profile/dvrs_gr_clst_spl_idnt_plot_pdf - - vdj_profile/dvrs_gr_idnt_spl_clst_plot_pdf - - vdj_profile/gene_spl_clst_vdjc_plot_pdf - - vdj_profile/gene_spl_idnt_vdjc_plot_pdf - - vdj_profile/chrd_gr_clst_plot_pdf - - vdj_profile/chrd_gr_idnt_plot_pdf - - vdj_profile/chrd_gr_cnd_plot_pdf - - vdj_profile/count_spl_cnd_plot_pdf - - vdj_profile/hmst_spl_cnd_plot_pdf - - vdj_profile/vrlp_spl_cnd_plot_pdf - - vdj_profile/ntwr_gr_cnd_plot_pdf - - vdj_profile/dvrs_gr_clst_spl_cnd_plot_pdf - - vdj_profile/dvrs_gr_cnd_spl_clst_plot_pdf + - vdj_profile/all_plots_pdf valueFrom: $(self.flat().filter(n => n)) folder_basename: default: "pdf_plots" @@ -658,7 +685,7 @@ $schemas: label: "Single-Cell Immune Profiling Analysis" s:name: "Single-Cell Immune Profiling Analysis" -s:alternateName: "TCR/BCR clonotype dynamics analysis" +s:alternateName: "Single-Cell Immune Profiling Analysis" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-vdj-profile.cwl s:codeRepository: https://github.com/Barski-lab/workflows-datirium @@ -699,4 +726,4 @@ doc: | Single-Cell Immune Profiling Analysis Estimates clonotype diversity and dynamics from V(D)J - sequencing data assembled into contigs \ No newline at end of file + sequencing data assembled into contigs. 
\ No newline at end of file From b447b2e0fcb8165012310dee8d5fe6829f32ea71 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 25 Apr 2024 18:41:58 -0400 Subject: [PATCH 134/162] Small changes in the HOMER pipeline, otherwise it fails with Toil --- workflows/homer-motif-analysis.cwl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/workflows/homer-motif-analysis.cwl b/workflows/homer-motif-analysis.cwl index a89948f0..2a094829 100644 --- a/workflows/homer-motif-analysis.cwl +++ b/workflows/homer-motif-analysis.cwl @@ -202,6 +202,8 @@ steps: in: intervals_file: make_unique/output_file genome_fasta_file: genome_fasta_file + output_filename: + default: "target.fa" out: - sequences_file @@ -210,6 +212,8 @@ steps: in: intervals_file: bedtools_shuffle/shuffled_bed_file genome_fasta_file: genome_fasta_file + output_filename: + default: "background.fa" out: - sequences_file From 539edd840cc01cc3464af851907f1b9cd49dd9a8 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 26 Apr 2024 11:26:26 -0400 Subject: [PATCH 135/162] Update MAnorm pipelines to take annotation from genome indices upstream, otherwise IGV doesn't work --- workflows/manorm-pe.cwl | 7 ++++--- workflows/manorm-se.cwl | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/workflows/manorm-pe.cwl b/workflows/manorm-pe.cwl index 9e968387..11a554b8 100644 --- a/workflows/manorm-pe.cwl +++ b/workflows/manorm-pe.cwl @@ -21,6 +21,8 @@ requirements: - "trim-atacseq-pe.cwl" - "cutandrun-macs2-pe.cwl" - "cutandrun-seacr-pe.cwl" + genome_indices: + - "genome-indices.cwl" inputs: @@ -85,11 +87,10 @@ inputs: annotation_file: type: File - label: "Annotation file" - format: "http://edamontology.org/format_3475" + label: "Genome type" doc: | Tab-separated annotation file - 'sd:upstreamSource': "first_chipseq_sample/genome_indices/annotation" + 'sd:upstreamSource': "genome_indices/annotation" shift_size_first: type: int? 
diff --git a/workflows/manorm-se.cwl b/workflows/manorm-se.cwl index 79c7bf56..2f4ed115 100644 --- a/workflows/manorm-se.cwl +++ b/workflows/manorm-se.cwl @@ -17,6 +17,8 @@ requirements: - "chipseq-se.cwl" - "trim-chipseq-se.cwl" - "trim-atacseq-se.cwl" + genome_indices: + - "genome-indices.cwl" inputs: @@ -81,11 +83,10 @@ inputs: annotation_file: type: File - label: "Annotation file" - format: "http://edamontology.org/format_3475" + label: "Genome type" doc: | Tab-separated annotation file - 'sd:upstreamSource': "first_chipseq_sample/genome_indices/annotation" + 'sd:upstreamSource': "genome_indices/annotation" shift_size_first: type: int? From 6436ef69247862a7b13eb098bfa78b72e10e5a08 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 30 Apr 2024 11:59:06 -0400 Subject: [PATCH 136/162] Add QC plots for all clustering sc pipelines --- tools/sc-atac-cluster.cwl | 125 +++-------- tools/sc-atac-filter.cwl | 12 +- tools/sc-ctype-assign.cwl | 286 ++++++++---------------- tools/sc-multiome-filter.cwl | 12 +- tools/sc-rna-cluster.cwl | 310 +++++++------------------- tools/sc-rna-filter.cwl | 8 +- tools/sc-wnn-cluster.cwl | 362 +++++++++++-------------------- workflows/sc-atac-cluster.cwl | 91 ++++++-- workflows/sc-atac-filter.cwl | 12 +- workflows/sc-ctype-assign.cwl | 169 +++++++++++---- workflows/sc-multiome-filter.cwl | 12 +- workflows/sc-rna-cluster.cwl | 128 +++++++---- workflows/sc-rna-filter.cwl | 12 +- workflows/sc-wnn-cluster.cwl | 199 +++++++++++++---- 14 files changed, 813 insertions(+), 925 deletions(-) diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index 56550096..d6332e13 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -273,105 +273,106 @@ inputs: outputs: - umap_gr_clst_res_plot_png: + cell_cnts_gr_clst_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_gr_clst_res_*.png" + glob: "*_cell_cnts_gr_clst_res_*.png" doc: | - UMAP colored by cluster. 
+ Number of cells per cluster. All cells; all resolutions. PNG format. - umap_gr_clst_res_plot_pdf: + tss_frgm_spl_clst_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_gr_clst_res_*.pdf" + glob: "*_tss_frgm_spl_clst_res_*.png" doc: | - UMAP colored by cluster. - All cells; all resolutions. - PDF format. + TSS enrichment score vs ATAC + fragments in peaks per cell. + Split by cluster; all cells; + all resolutions. + PNG format. - slh_gr_clst_res_plot_png: + atacdbl_gr_clst_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_slh_gr_clst_res_*.png" + glob: "*_atacdbl_gr_clst_res_*.png" doc: | - Silhouette scores. + Percentage of ATAC doublets per cluster. All cells; all resolutions. PNG format. - slh_gr_clst_res_plot_pdf: + qc_mtrcs_dnst_gr_clst_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_slh_gr_clst_res_*.pdf" + glob: "*_qc_mtrcs_dnst_gr_clst_res_*.png" doc: | - Silhouette scores. + Distribution of QC metrics per cell + colored by cluster. All cells; all resolutions. - PDF format. + PNG format. - umap_gr_clst_spl_idnt_res_plot_png: + umap_gr_clst_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_gr_clst_spl_idnt_res_*.png" + glob: "*_umap_gr_clst_res_*.png" doc: | UMAP colored by cluster. - Split by dataset; downsampled to the - smallest dataset; all resolutions. + All cells; all resolutions. PNG format. - umap_gr_clst_spl_idnt_res_plot_pdf: + slh_gr_clst_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_umap_gr_clst_spl_idnt_res_*.pdf" + glob: "*_slh_gr_clst_res_*.png" doc: | - UMAP colored by cluster. - Split by dataset; downsampled to the - smallest dataset; all resolutions. - PDF format. + Silhouette scores. + All cells; all resolutions. + PNG format. 
- cmp_gr_clst_spl_idnt_res_plot_png: + umap_gr_clst_spl_idnt_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_cmp_gr_clst_spl_idnt_res_*.png" + glob: "*_umap_gr_clst_spl_idnt_res_*.png" doc: | - Composition plot colored by cluster. + UMAP colored by cluster. Split by dataset; downsampled to the smallest dataset; all resolutions. PNG format. - cmp_gr_clst_spl_idnt_res_plot_pdf: + cmp_gr_clst_spl_idnt_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_cmp_gr_clst_spl_idnt_res_*.pdf" + glob: "*_cmp_gr_clst_spl_idnt_res_*.png" doc: | Composition plot colored by cluster. Split by dataset; downsampled to the smallest dataset; all resolutions. - PDF format. + PNG format. cmp_gr_idnt_spl_clst_res_plot_png: type: @@ -386,19 +387,6 @@ outputs: smallest dataset; all resolutions. PNG format. - cmp_gr_idnt_spl_clst_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_cmp_gr_idnt_spl_clst_res_*.pdf" - doc: | - Composition plot colored by dataset. - Split by cluster; downsampled to the - smallest dataset; all resolutions. - PDF format. - umap_gr_clst_spl_cnd_res_plot_png: type: - "null" @@ -413,20 +401,6 @@ outputs: the smallest group; all resolutions. PNG format. - umap_gr_clst_spl_cnd_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_umap_gr_clst_spl_cnd_res_*.pdf" - doc: | - UMAP colored by cluster. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group; all resolutions. - PDF format. - cmp_gr_clst_spl_cnd_res_plot_png: type: - "null" @@ -441,20 +415,6 @@ outputs: the smallest group; all resolutions. PNG format. - cmp_gr_clst_spl_cnd_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_cmp_gr_clst_spl_cnd_res_*.pdf" - doc: | - Composition plot colored by cluster. 
- Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group; all resolutions. - PDF format. - cmp_gr_cnd_spl_clst_res_plot_png: type: - "null" @@ -469,20 +429,6 @@ outputs: smallest group; all resolutions. PNG format. - cmp_gr_cnd_spl_clst_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_cmp_gr_cnd_spl_clst_res_*.pdf" - doc: | - Composition plot colored by grouping condition. - Split by cluster; first downsampled to the - smallest dataset, then downsampled to the - smallest group; all resolutions. - PDF format. - cvrg_res_plot_png: type: - "null" @@ -495,16 +441,15 @@ outputs: All genes of interest; all resolutions. PNG format. - cvrg_res_plot_pdf: + all_plots_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_cvrg_res_*.pdf" + glob: "*.pdf" doc: | - ATAC fragment coverage. - All genes of interest; all resolutions. + All generated plots. PDF format. peak_markers_tsv: diff --git a/tools/sc-atac-filter.cwl b/tools/sc-atac-filter.cwl index 288dc0f4..95b574be 100644 --- a/tools/sc-atac-filter.cwl +++ b/tools/sc-atac-filter.cwl @@ -359,10 +359,10 @@ outputs: Unfiltered; PC2/PC3. PNG format. - raw_cells_count_plot_png: + raw_cell_cnts_plot_png: type: File? outputBinding: - glob: "*_raw_cells_count.png" + glob: "*_raw_cell_cnts.png" doc: | Number of cells per dataset. Unfiltered. @@ -494,10 +494,10 @@ outputs: PC2/PC3. PNG format. - mid_fltr_cells_count_plot_png: + mid_fltr_cell_cnts_plot_png: type: File? outputBinding: - glob: "*_mid_fltr_cells_count.png" + glob: "*_mid_fltr_cell_cnts.png" doc: | Number of cells per dataset. Unfiltered, after MACS2 peak calling. @@ -632,10 +632,10 @@ outputs: Filtered; PC2/PC3. PNG format. - fltr_cells_count_plot_png: + fltr_cell_cnts_plot_png: type: File? outputBinding: - glob: "*[!_mid]_fltr_cells_count.png" + glob: "*[!_mid]_fltr_cell_cnts.png" doc: | Number of cells per dataset. Filtered. 
diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index b2a7281c..e7498480 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -351,63 +351,118 @@ inputs: outputs: - umap_gr_ctyp_plot_png: + cell_cnts_gr_ctyp_plot_png: type: File? outputBinding: - glob: "*_umap_gr_ctyp.png" + glob: "*_cell_cnts_gr_ctyp.png" doc: | - UMAP colored by cell type. + Number of cells per cell type. All cells. PNG format. - umap_gr_ctyp_plot_pdf: + gene_umi_spl_ctyp_plot_png: type: File? outputBinding: - glob: "*_umap_gr_ctyp.pdf" + glob: "*_gene_umi_spl_ctyp.png" doc: | - UMAP colored by cell type. + Genes vs RNA reads per cell. + Split by cell type; all cells. + PNG format. + + umi_mito_spl_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_umi_mito_spl_ctyp.png" + doc: | + RNA reads vs mitochondrial % per cell. + Split by cell type; all cells. + PNG format. + + rnadbl_gr_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_rnadbl_gr_ctyp.png" + doc: | + Percentage of RNA doublets per cell type. All cells. - PDF format. + PNG format. - umap_gr_ctyp_spl_idnt_plot_png: + tss_frgm_spl_ctyp_plot_png: type: File? outputBinding: - glob: "*_umap_gr_ctyp_spl_idnt.png" + glob: "*_tss_frgm_spl_ctyp.png" doc: | - UMAP colored by cell type. - Split by dataset; downsampled to the - smallest dataset. + TSS enrichment score vs ATAC + fragments in peaks per cell. + Split by cell type; all cells. + PNG format. + + atacdbl_gr_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_atacdbl_gr_ctyp.png" + doc: | + Percentage of ATAC doublets per cell type. + All cells. PNG format. - umap_gr_ctyp_spl_idnt_plot_pdf: + rna_atac_cnts_spl_ctyp_plot_png: type: File? outputBinding: - glob: "*_umap_gr_ctyp_spl_idnt.pdf" + glob: "*_rna_atac_cnts_spl_ctyp.png" + doc: | + RNA reads vs ATAC fragments in peaks per cell. + Split by cell type; all cells. + PNG format. + + vrlpdbl_gr_ctyp_plot_png: + type: File? 
+ outputBinding: + glob: "*_vrlpdbl_gr_ctyp.png" + doc: | + Percentage of RNA and ATAC doublets + per cell type. + All cells. + PNG format. + + qc_mtrcs_dnst_gr_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_qc_mtrcs_dnst_gr_ctyp.png" + doc: | + Distribution of QC metrics per cell + colored by cell type. + All cells. + PNG format. + + umap_gr_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_ctyp.png" doc: | UMAP colored by cell type. - Split by dataset; downsampled to the - smallest dataset. - PDF format. + All cells. + PNG format. - cmp_gr_ctyp_spl_idnt_plot_png: + umap_gr_ctyp_spl_idnt_plot_png: type: File? outputBinding: - glob: "*_cmp_gr_ctyp_spl_idnt.png" + glob: "*_umap_gr_ctyp_spl_idnt.png" doc: | - Composition plot colored by cell type. + UMAP colored by cell type. Split by dataset; downsampled to the smallest dataset. PNG format. - cmp_gr_ctyp_spl_idnt_plot_pdf: + cmp_gr_ctyp_spl_idnt_plot_png: type: File? outputBinding: - glob: "*_cmp_gr_ctyp_spl_idnt.pdf" + glob: "*_cmp_gr_ctyp_spl_idnt.png" doc: | Composition plot colored by cell type. Split by dataset; downsampled to the smallest dataset. - PDF format. + PNG format. cmp_gr_idnt_spl_ctyp_plot_png: type: File? @@ -419,16 +474,6 @@ outputs: the smallest dataset. PNG format. - cmp_gr_idnt_spl_ctyp_plot_pdf: - type: File? - outputBinding: - glob: "*_cmp_gr_idnt_spl_ctyp.pdf" - doc: | - Composition plot colored by dataset. - Split by cell type; downsampled to - the smallest dataset. - PDF format. - umap_gr_ph_spl_idnt_plot_png: type: File? outputBinding: @@ -439,16 +484,6 @@ outputs: smallest dataset. PNG format. - umap_gr_ph_spl_idnt_plot_pdf: - type: File? - outputBinding: - glob: "*_umap_gr_ph_spl_idnt.pdf" - doc: | - UMAP colored by cell cycle phase. - Split by dataset; downsampled to the - smallest dataset. - PDF format. - cmp_gr_ph_spl_idnt_plot_png: type: File? outputBinding: @@ -459,16 +494,6 @@ outputs: dataset. PNG format. - cmp_gr_ph_spl_idnt_plot_pdf: - type: File? 
- outputBinding: - glob: "*_cmp_gr_ph_spl_idnt.pdf" - doc: | - Composition plot colored by cell cycle phase. - Split by dataset; downsampled to the smallest - dataset. - PDF format. - umap_gr_ctyp_spl_ph_png: type: File? outputBinding: @@ -480,17 +505,6 @@ outputs: datasets are analyzed jointly). PNG format. - umap_gr_ctyp_spl_ph_plot_pdf: - type: File? - outputBinding: - glob: "*_umap_gr_ctyp_spl_ph.pdf" - doc: | - UMAP colored by cell type. - Split by cell cycle phase; downsampled - to the smallest dataset (if multiple - datasets are analyzed jointly). - PDF format. - cmp_gr_ph_spl_ctyp_png: type: File? outputBinding: @@ -502,17 +516,6 @@ outputs: analyzed jointly). PNG format. - cmp_gr_ph_spl_ctyp_plot_pdf: - type: File? - outputBinding: - glob: "*_cmp_gr_ph_spl_ctyp.pdf" - doc: | - Composition plot colored by cell cycle phase. - Split by cell type; downsampled to the - smallest dataset (if multiple datasets are - analyzed jointly). - PDF format. - umap_gr_ctyp_spl_cnd_plot_png: type: File? outputBinding: @@ -524,17 +527,6 @@ outputs: the smallest group. PNG format. - umap_gr_ctyp_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_umap_gr_ctyp_spl_cnd.pdf" - doc: | - UMAP colored by cell type. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group. - PDF format. - cmp_gr_ctyp_spl_cnd_plot_png: type: File? outputBinding: @@ -546,17 +538,6 @@ outputs: the smallest group. PNG format. - cmp_gr_ctyp_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_cmp_gr_ctyp_spl_cnd.pdf" - doc: | - Composition plot colored by cell type. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group. - PDF format. - cmp_gr_cnd_spl_ctyp_plot_png: type: File? outputBinding: @@ -568,17 +549,6 @@ outputs: smallest group. PNG format. - cmp_gr_cnd_spl_ctyp_plot_pdf: - type: File? 
- outputBinding: - glob: "*_cmp_gr_cnd_spl_ctyp.pdf" - doc: | - Composition plot colored by grouping condition. - Split by cell type; first downsampled to the - smallest dataset, then downsampled to the - smallest group. - PDF format. - umap_gr_ph_spl_cnd_plot_png: type: File? outputBinding: @@ -590,17 +560,6 @@ outputs: the smallest group. PNG format. - umap_gr_ph_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_umap_gr_ph_spl_cnd.pdf" - doc: | - UMAP colored by cell cycle phase. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group. - PDF format. - cmp_gr_ph_spl_cnd_plot_png: type: File? outputBinding: @@ -612,17 +571,6 @@ outputs: the smallest group. PNG format. - cmp_gr_ph_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_cmp_gr_ph_spl_cnd.pdf" - doc: | - Composition plot colored by cell cycle phase. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group. - PDF format. - xpr_avg_plot_png: type: File? outputBinding: @@ -631,14 +579,6 @@ outputs: Average gene expression. PNG format. - xpr_avg_plot_pdf: - type: File? - outputBinding: - glob: "*_xpr_avg.pdf" - doc: | - Average gene expression. - PDF format. - xpr_per_cell_plot_png: type: - "null" @@ -651,18 +591,6 @@ outputs: All genes of interest. PNG format. - xpr_per_cell_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_xpr_per_cell_[!sgnl_]*.pdf" - doc: | - UMAP colored by gene expression. - All genes of interest. - PDF format. - xpr_per_cell_sgnl_plot_png: type: - "null" @@ -675,18 +603,6 @@ outputs: All genes of interest. PNG format. - xpr_per_cell_sgnl_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_xpr_per_cell_sgnl_*.pdf" - doc: | - UMAP colored by gene expression density. - All genes of interest. - PDF format. 
- xpr_dnst_plot_png: type: - "null" @@ -699,18 +615,6 @@ outputs: All genes of interest. PNG format. - xpr_dnst_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_xpr_dnst_*.pdf" - doc: | - Gene expression density. - All genes of interest. - PDF format. - xpr_htmp_plot_png: type: File? outputBinding: @@ -720,24 +624,6 @@ outputs: Top gene markers. PNG format. - xpr_htmp_plot_pdf: - type: File? - outputBinding: - glob: "*_xpr_htmp.pdf" - doc: | - Gene expression heatmap. - Top gene markers. - PDF format. - - xpr_htmp_tsv: - type: File? - outputBinding: - glob: "*_xpr_htmp.tsv" - doc: | - Gene expression heatmap. - Top gene markers. - TSV format. - cvrg_plot_png: type: - "null" @@ -750,18 +636,26 @@ outputs: All genes of interest. PNG format. - cvrg_plot_pdf: + all_plots_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_cvrg_*.pdf" + glob: "*.pdf" doc: | - ATAC fragment coverage. - All genes of interest. + All generated plots. PDF format. + xpr_htmp_tsv: + type: File? + outputBinding: + glob: "*_xpr_htmp.tsv" + doc: | + Gene expression heatmap. + Top gene markers. + TSV format. + gene_markers_tsv: type: File? outputBinding: diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index fe7c4732..5e001ee7 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -479,10 +479,10 @@ outputs: Unfiltered; PC2/PC3. PNG format. - raw_cells_count_plot_png: + raw_cell_cnts_plot_png: type: File? outputBinding: - glob: "*_raw_cells_count.png" + glob: "*_raw_cell_cnts.png" doc: | Number of cells per dataset. Unfiltered. @@ -735,10 +735,10 @@ outputs: PC2/PC3. PNG format. - mid_fltr_cells_count_plot_png: + mid_fltr_cell_cnts_plot_png: type: File? outputBinding: - glob: "*_mid_fltr_cells_count.png" + glob: "*_mid_fltr_cell_cnts.png" doc: | Number of cells per dataset. Unfiltered, after MACS2 peak calling. @@ -998,10 +998,10 @@ outputs: Filtered; PC2/PC3. PNG format. 
- fltr_cells_count_plot_png: + fltr_cell_cnts_plot_png: type: File? outputBinding: - glob: "*[!_mid]_fltr_cells_count.png" + glob: "*[!_mid]_fltr_cell_cnts.png" doc: | Number of cells per dataset. Filtered. diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index f0fb946f..6aabc6d3 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -270,25 +270,78 @@ inputs: outputs: - umap_gr_ph_spl_idnt_plot_png: - type: File? + cell_cnts_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File outputBinding: - glob: "*_umap_gr_ph_spl_idnt.png" + glob: "*_cell_cnts_gr_clst_res_*.png" doc: | - UMAP colored by cell cycle phase. - Split by dataset; downsampled to the - smallest dataset. + Number of cells per cluster. + All cells; all resolutions. + PNG format. + + gene_umi_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_gene_umi_spl_clst_res_*.png" + doc: | + Genes vs RNA reads per cell. + Split by cluster; all cells; + all resolutions. + PNG format. + + umi_mito_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umi_mito_spl_clst_res_*.png" + doc: | + RNA reads vs mitochondrial % per cell. + Split by cluster; all cells; all + resolutions. + PNG format. + + rnadbl_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_rnadbl_gr_clst_res_*.png" + doc: | + Percentage of RNA doublets per cluster. + All cells; all resolutions. PNG format. - umap_gr_ph_spl_idnt_plot_pdf: + qc_mtrcs_dnst_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_qc_mtrcs_dnst_gr_clst_res_*.png" + doc: | + Distribution of QC metrics per cell + colored by cluster. + All cells; all resolutions. + PNG format. + + umap_gr_ph_spl_idnt_plot_png: type: File? outputBinding: - glob: "*_umap_gr_ph_spl_idnt.pdf" + glob: "*_umap_gr_ph_spl_idnt.png" doc: | UMAP colored by cell cycle phase. 
Split by dataset; downsampled to the smallest dataset. - PDF format. + PNG format. cmp_gr_ph_spl_idnt_plot_png: type: File? @@ -298,17 +351,7 @@ outputs: Composition plot colored by cell cycle phase. Split by dataset; downsampled to the smallest dataset. - PNG format - - cmp_gr_ph_spl_idnt_plot_pdf: - type: File? - outputBinding: - glob: "*_cmp_gr_ph_spl_idnt.pdf" - doc: | - Composition plot colored by cell cycle phase. - Split by dataset; downsampled to the smallest - dataset. - PDF format + PNG format. umap_gr_ph_spl_cnd_plot_png: type: File? @@ -321,17 +364,6 @@ outputs: the smallest group. PNG format. - umap_gr_ph_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_umap_gr_ph_spl_cnd.pdf" - doc: | - UMAP colored by cell cycle phase. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group. - PDF format. - cmp_gr_ph_spl_cnd_plot_png: type: File? outputBinding: @@ -343,17 +375,6 @@ outputs: the smallest group. PNG format. - cmp_gr_ph_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_cmp_gr_ph_spl_cnd.pdf" - doc: | - Composition plot colored by cell cycle phase. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group. - PDF format. - umap_gr_clst_res_plot_png: type: - "null" @@ -364,19 +385,7 @@ outputs: doc: | UMAP colored by cluster. All cells; all resolutions. - PNG format - - umap_gr_clst_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_umap_gr_clst_res_*.pdf" - doc: | - UMAP colored by cluster. - All cells; all resolutions. - PDF format + PNG format. slh_gr_clst_res_plot_png: type: @@ -388,19 +397,7 @@ outputs: doc: | Silhouette scores. All cells; all resolutions. - PNG format - - slh_gr_clst_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_slh_gr_clst_res_*.pdf" - doc: | - Silhouette scores. - All cells; all resolutions. 
- PDF format + PNG format. umap_gr_clst_spl_idnt_res_plot_png: type: @@ -415,19 +412,6 @@ outputs: smallest dataset; all resolutions. PNG format. - umap_gr_clst_spl_idnt_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_umap_gr_clst_spl_idnt_res_*.pdf" - doc: | - UMAP colored by cluster. - Split by dataset; downsampled to the - smallest dataset; all resolutions. - PDF format. - cmp_gr_clst_spl_idnt_res_plot_png: type: - "null" @@ -441,19 +425,6 @@ outputs: smallest dataset; all resolutions. PNG format. - cmp_gr_clst_spl_idnt_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_cmp_gr_clst_spl_idnt_res_*.pdf" - doc: | - Composition plot colored by cluster. - Split by dataset; downsampled to the - smallest dataset; all resolutions. - PDF format. - cmp_gr_idnt_spl_clst_res_plot_png: type: - "null" @@ -467,19 +438,6 @@ outputs: smallest dataset; all resolutions. PNG format. - cmp_gr_idnt_spl_clst_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_cmp_gr_idnt_spl_clst_res_*.pdf" - doc: | - Composition plot colored by dataset. - Split by cluster; downsampled to the - smallest dataset; all resolutions. - PDF format. - umap_gr_clst_spl_ph_res_plot_png: type: - "null" @@ -495,21 +453,6 @@ outputs: resolutions. PNG format. - umap_gr_clst_spl_ph_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_umap_gr_clst_spl_ph_res_*.pdf" - doc: | - UMAP colored by cluster. - Split by cell cycle phase; downsampled - to the smallest dataset (if multiple - datasets are analyzed jointly); all - resolutions. - PDF format. - cmp_gr_ph_spl_clst_res_plot_png: type: - "null" @@ -522,21 +465,7 @@ outputs: Split by cluster; downsampled to the smallest dataset (if multiple datasets are analyzed jointly); all resolutions. 
- PNG format - - cmp_gr_ph_spl_clst_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_cmp_gr_ph_spl_clst_res_*.pdf" - doc: | - Composition plot colored by cell cycle phase. - Split by cluster; downsampled to the smallest - dataset (if multiple datasets are analyzed - jointly); all resolutions. - PDF format + PNG format. umap_gr_clst_spl_cnd_res_plot_png: type: @@ -552,20 +481,6 @@ outputs: the smallest group; all resolutions. PNG format. - umap_gr_clst_spl_cnd_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_umap_gr_clst_spl_cnd_res_*.pdf" - doc: | - UMAP colored by cluster. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group; all resolutions. - PDF format. - cmp_gr_clst_spl_cnd_res_plot_png: type: - "null" @@ -580,20 +495,6 @@ outputs: the smallest group; all resolutions. PNG format. - cmp_gr_clst_spl_cnd_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_cmp_gr_clst_spl_cnd_res_*.pdf" - doc: | - Composition plot colored by cluster. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group; all resolutions. - PDF format. - cmp_gr_cnd_spl_clst_res_plot_png: type: - "null" @@ -608,20 +509,6 @@ outputs: smallest group; all resolutions. PNG format. - cmp_gr_cnd_spl_clst_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_cmp_gr_cnd_spl_clst_res_*.pdf" - doc: | - Composition plot colored by grouping condition. - Split by cluster; first downsampled to the - smallest dataset, then downsampled to the - smallest group; all resolutions. - PDF format. - xpr_per_cell_plot_png: type: - "null" @@ -634,18 +521,6 @@ outputs: All genes of interest. PNG format. 
- xpr_per_cell_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_xpr_per_cell_[!sgnl_]*.pdf" - doc: | - UMAP colored by gene expression. - All genes of interest. - PDF format. - xpr_per_cell_sgnl_plot_png: type: - "null" @@ -658,18 +533,6 @@ outputs: All genes of interest. PNG format. - xpr_per_cell_sgnl_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_xpr_per_cell_sgnl_*.pdf" - doc: | - UMAP colored by gene expression density. - All genes of interest. - PDF format. - xpr_avg_res_plot_png: type: - "null" @@ -682,18 +545,6 @@ outputs: All resolutions. PNG format. - xpr_avg_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_xpr_avg_res_*.pdf" - doc: | - Average gene expression. - All resolutions. - PDF format. - xpr_dnst_res_plot_png: type: - "null" @@ -706,18 +557,6 @@ outputs: All genes of interest; all resolutions. PNG format. - xpr_dnst_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_xpr_dnst_res_*.pdf" - doc: | - Gene expression density. - All genes of interest; all resolutions. - PDF format. - xpr_htmp_res_plot_png: type: - "null" @@ -730,16 +569,15 @@ outputs: Top gene markers; all resolutions. PNG format. - xpr_htmp_res_plot_pdf: + all_plots_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_htmp_res_*.pdf" + glob: "*.pdf" doc: | - Gene expression heatmap. - Top gene markers; all resolutions. + All generated plots. PDF format. xpr_htmp_res_tsv: @@ -790,7 +628,7 @@ outputs: glob: "*_data.rds" doc: | Seurat object. - RDS format + RDS format. seurat_data_h5seurat: type: File? @@ -798,7 +636,7 @@ outputs: glob: "*_data.h5seurat" doc: | Seurat object. - h5Seurat format + h5Seurat format. seurat_data_h5ad: type: File? @@ -806,7 +644,7 @@ outputs: glob: "*_counts.h5ad" doc: | Seurat object. - H5AD format + H5AD format. seurat_data_cloupe: type: File? 
@@ -814,7 +652,7 @@ outputs: glob: "*_counts.cloupe" doc: | Seurat object. - Loupe format + Loupe format. seurat_data_scope: type: File? @@ -823,7 +661,7 @@ outputs: doc: | Seurat object. SCope compatible. - Loom format + Loom format. stdout_log: type: stdout diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index f304eaf5..5bb29826 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -302,10 +302,10 @@ outputs: Unfiltered; PC2/PC3. PNG format. - raw_cells_count_plot_png: + raw_cell_cnts_plot_png: type: File? outputBinding: - glob: "*_raw_cells_count.png" + glob: "*_raw_cell_cnts.png" doc: | Number of cells per dataset. Unfiltered. @@ -440,10 +440,10 @@ outputs: Filtered; PC2/PC3. PNG format. - fltr_cells_count_plot_png: + fltr_cell_cnts_plot_png: type: File? outputBinding: - glob: "*_fltr_cells_count.png" + glob: "*_fltr_cell_cnts.png" doc: | Number of cells per dataset. Filtered. diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index 13576b10..b5cfb4a0 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -425,25 +425,129 @@ inputs: outputs: - umap_gr_ph_spl_idnt_plot_png: - type: File? + cell_cnts_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File outputBinding: - glob: "*_umap_gr_ph_spl_idnt.png" + glob: "*_cell_cnts_gr_clst_res_*.png" doc: | - UMAP colored by cell cycle phase. - Split by dataset; downsampled to the - smallest dataset. + Number of cells per cluster. + All cells; all resolutions. + PNG format. + + gene_umi_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_gene_umi_spl_clst_res_*.png" + doc: | + Genes vs RNA reads per cell. + Split by cluster; all cells; + all resolutions. + PNG format. + + umi_mito_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_umi_mito_spl_clst_res_*.png" + doc: | + RNA reads vs mitochondrial % per cell. 
+ Split by cluster; all cells; all + resolutions. + PNG format. + + rna_atac_cnts_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_rna_atac_cnts_spl_clst_res_*.png" + doc: | + RNA reads vs ATAC fragments in peaks per cell. + Split by cluster; all cells; all resolutions. + PNG format. + + tss_frgm_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_tss_frgm_spl_clst_res_*.png" + doc: | + TSS enrichment score vs ATAC + fragments in peaks per cell. + Split by cluster; all cells; + all resolutions. + PNG format. + + rnadbl_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_rnadbl_gr_clst_res_*.png" + doc: | + Percentage of RNA doublets per cluster. + All cells; all resolutions. + PNG format. + + atacdbl_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_atacdbl_gr_clst_res_*.png" + doc: | + Percentage of ATAC doublets per cluster. + All cells; all resolutions. + PNG format. + + vrlpdbl_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_vrlpdbl_gr_clst_res_*.png" + doc: | + Percentage of RNA and ATAC doublets + per cluster. + All cells; all resolutions. PNG format. - umap_gr_ph_spl_idnt_plot_pdf: + qc_mtrcs_dnst_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_qc_mtrcs_dnst_gr_clst_res_*.png" + doc: | + Distribution of QC metrics per cell + colored by cluster. + All cells; all resolutions. + PNG format. + + umap_gr_ph_spl_idnt_plot_png: type: File? outputBinding: - glob: "*_umap_gr_ph_spl_idnt.pdf" + glob: "*_umap_gr_ph_spl_idnt.png" doc: | UMAP colored by cell cycle phase. Split by dataset; downsampled to the smallest dataset. - PDF format. + PNG format. cmp_gr_ph_spl_idnt_plot_png: type: File? @@ -455,16 +559,6 @@ outputs: dataset. 
PNG format - cmp_gr_ph_spl_idnt_plot_pdf: - type: File? - outputBinding: - glob: "*_cmp_gr_ph_spl_idnt.pdf" - doc: | - Composition plot colored by cell cycle phase. - Split by dataset; downsampled to the smallest - dataset. - PDF format - umap_gr_ph_spl_cnd_plot_png: type: File? outputBinding: @@ -476,17 +570,6 @@ outputs: the smallest group. PNG format. - umap_gr_ph_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_umap_gr_ph_spl_cnd.pdf" - doc: | - UMAP colored by cell cycle phase. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group. - PDF format. - cmp_gr_ph_spl_cnd_plot_png: type: File? outputBinding: @@ -498,17 +581,6 @@ outputs: the smallest group. PNG format. - cmp_gr_ph_spl_cnd_plot_pdf: - type: File? - outputBinding: - glob: "*_cmp_gr_ph_spl_cnd.pdf" - doc: | - Composition plot colored by cell cycle phase. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group. - PDF format. - umap_gr_clst_res_plot_png: type: - "null" @@ -521,18 +593,6 @@ outputs: All cells; all resolutions. PNG format - umap_gr_clst_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_umap_gr_clst_res_*.pdf" - doc: | - UMAP colored by cluster. - All cells; all resolutions. - PDF format - umap_gr_clst_spl_idnt_res_plot_png: type: - "null" @@ -546,19 +606,6 @@ outputs: smallest dataset; all resolutions. PNG format. - umap_gr_clst_spl_idnt_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_umap_gr_clst_spl_idnt_res_*.pdf" - doc: | - UMAP colored by cluster. - Split by dataset; downsampled to the - smallest dataset; all resolutions. - PDF format. - cmp_gr_clst_spl_idnt_res_plot_png: type: - "null" @@ -572,19 +619,6 @@ outputs: smallest dataset; all resolutions. PNG format. 
- cmp_gr_clst_spl_idnt_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_cmp_gr_clst_spl_idnt_res_*.pdf" - doc: | - Composition plot colored by cluster. - Split by dataset; downsampled to the - smallest dataset; all resolutions. - PDF format. - cmp_gr_idnt_spl_clst_res_plot_png: type: - "null" @@ -598,19 +632,6 @@ outputs: smallest dataset; all resolutions. PNG format. - cmp_gr_idnt_spl_clst_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_cmp_gr_idnt_spl_clst_res_*.pdf" - doc: | - Composition plot colored by dataset. - Split by cluster; downsampled to the - smallest dataset; all resolutions. - PDF format. - umap_gr_clst_spl_ph_res_plot_png: type: - "null" @@ -626,21 +647,6 @@ outputs: resolutions. PNG format. - umap_gr_clst_spl_ph_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_umap_gr_clst_spl_ph_res_*.pdf" - doc: | - UMAP colored by cluster. - Split by cell cycle phase; downsampled - to the smallest dataset (if multiple - datasets are analyzed jointly); all - resolutions. - PDF format. - cmp_gr_ph_spl_clst_res_plot_png: type: - "null" @@ -656,21 +662,6 @@ outputs: resolutions. PNG format - cmp_gr_ph_spl_clst_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_cmp_gr_ph_spl_clst_res_*.pdf" - doc: | - Composition plot colored by cell cycle phase. - Split by cell cycle phase; downsampled - to the smallest dataset (if multiple - datasets are analyzed jointly); all - resolutions. - PDF format - umap_gr_clst_spl_cnd_res_plot_png: type: - "null" @@ -685,20 +676,6 @@ outputs: the smallest group; all resolutions. PNG format. - umap_gr_clst_spl_cnd_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_umap_gr_clst_spl_cnd_res_*.pdf" - doc: | - UMAP colored by cluster. 
- Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group; all resolutions. - PDF format. - cmp_gr_clst_spl_cnd_res_plot_png: type: - "null" @@ -713,20 +690,6 @@ outputs: the smallest group; all resolutions. PNG format. - cmp_gr_clst_spl_cnd_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_cmp_gr_clst_spl_cnd_res_*.pdf" - doc: | - Composition plot colored by cluster. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group; all resolutions. - PDF format. - cmp_gr_cnd_spl_clst_res_plot_png: type: - "null" @@ -741,20 +704,6 @@ outputs: smallest group; all resolutions. PNG format. - cmp_gr_cnd_spl_clst_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_cmp_gr_cnd_spl_clst_res_*.pdf" - doc: | - Composition plot colored by grouping condition. - Split by cluster; first downsampled to the - smallest dataset, then downsampled to the - smallest group; all resolutions. - PDF format. - xpr_per_cell_plot_png: type: - "null" @@ -767,18 +716,6 @@ outputs: All genes of interest. PNG format. - xpr_per_cell_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_xpr_per_cell_[!sgnl_]*.pdf" - doc: | - UMAP colored by gene expression. - All genes of interest. - PDF format. - xpr_per_cell_sgnl_plot_png: type: - "null" @@ -791,18 +728,6 @@ outputs: All genes of interest. PNG format. - xpr_per_cell_sgnl_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_xpr_per_cell_sgnl_*.pdf" - doc: | - UMAP colored by gene expression density. - All genes of interest. - PDF format. - xpr_avg_res_plot_png: type: - "null" @@ -815,18 +740,6 @@ outputs: All resolutions. PNG format. - xpr_avg_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_xpr_avg_res_*.pdf" - doc: | - Average gene expression. 
- All resolutions. - PDF format. - xpr_dnst_res_plot_png: type: - "null" @@ -839,40 +752,39 @@ outputs: All genes of interest; all resolutions. PNG format. - xpr_dnst_res_plot_pdf: + xpr_htmp_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_dnst_res_*.pdf" + glob: "*_xpr_htmp_res_*.png" doc: | - Gene expression density. - All genes of interest; all resolutions. - PDF format. + Gene expression heatmap. + Top gene markers; all resolutions. + PNG format. - xpr_htmp_res_plot_png: + cvrg_res_plot_png: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_htmp_res_*.png" + glob: "*_cvrg_res_*.png" doc: | - Gene expression heatmap. - Top gene markers; all resolutions. + ATAC fragment coverage. + All genes of interest; all resolutions. PNG format. - xpr_htmp_res_plot_pdf: + all_plots_pdf: type: - "null" - type: array items: File outputBinding: - glob: "*_xpr_htmp_res_*.pdf" + glob: "*.pdf" doc: | - Gene expression heatmap. - Top gene markers; all resolutions. + All generated plots. PDF format. xpr_htmp_res_tsv: @@ -887,30 +799,6 @@ outputs: Top gene markers; all resolutions. TSV format. - cvrg_res_plot_png: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_cvrg_res_*.png" - doc: | - ATAC fragment coverage. - All genes of interest; all resolutions. - PNG format. - - cvrg_res_plot_pdf: - type: - - "null" - - type: array - items: File - outputBinding: - glob: "*_cvrg_res_*.pdf" - doc: | - ATAC fragment coverage. - All genes of interest; all resolutions. - PDF format. - gene_markers_tsv: type: File? 
outputBinding: diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index dbfde3e3..f662ae0e 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -174,6 +174,73 @@ inputs: outputs: + cell_cnts_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_atac_cluster/cell_cnts_gr_clst_res_plot_png + label: "Number of cells per cluster (all cells)" + doc: | + Number of cells per cluster. + All cells; all resolutions. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Number of cells per cluster (all cells)" + + qc_mtrcs_dnst_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_atac_cluster/qc_mtrcs_dnst_gr_clst_res_plot_png + label: "Distribution of QC metrics per cell colored by cluster (all cells)" + doc: | + Distribution of QC metrics per cell + colored by cluster. + All cells; all resolutions. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Distribution of QC metrics per cell colored by cluster (all cells)" + + tss_frgm_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_atac_cluster/tss_frgm_spl_clst_res_plot_png + label: "TSS enrichment score vs ATAC fragments in peaks per cell (split by cluster, all cells)" + doc: | + TSS enrichment score vs ATAC + fragments in peaks per cell. + Split by cluster; all cells; + all resolutions. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "TSS enrichment score vs ATAC fragments in peaks per cell (split by cluster, all cells)" + + atacdbl_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_atac_cluster/atacdbl_gr_clst_res_plot_png + label: "Percentage of ATAC doublets per cluster (all cells)" + doc: | + Percentage of ATAC doublets per cluster. + All cells; all resolutions. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Percentage of ATAC doublets per cluster (all cells)" + umap_gr_clst_res_plot_png: type: - "null" @@ -393,6 +460,10 @@ steps: source: threads valueFrom: $(parseInt(self)) out: + - cell_cnts_gr_clst_res_plot_png + - tss_frgm_spl_clst_res_plot_png + - atacdbl_gr_clst_res_plot_png + - qc_mtrcs_dnst_gr_clst_res_plot_png - umap_gr_clst_res_plot_png - slh_gr_clst_res_plot_png - umap_gr_clst_spl_idnt_res_plot_png @@ -400,15 +471,7 @@ steps: - umap_gr_clst_spl_cnd_res_plot_png - cmp_gr_clst_spl_cnd_res_plot_png - cvrg_res_plot_png - - umap_gr_clst_res_plot_pdf - - slh_gr_clst_res_plot_pdf - - umap_gr_clst_spl_idnt_res_plot_pdf - - cmp_gr_clst_spl_idnt_res_plot_pdf - - cmp_gr_idnt_spl_clst_res_plot_pdf - - umap_gr_clst_spl_cnd_res_plot_pdf - - cmp_gr_clst_spl_cnd_res_plot_pdf - - cmp_gr_cnd_spl_clst_res_plot_pdf - - cvrg_res_plot_pdf + - all_plots_pdf - peak_markers_tsv - ucsc_cb_html_data - ucsc_cb_html_file @@ -421,15 +484,7 @@ steps: in: input_files: source: - - sc_atac_cluster/umap_gr_clst_res_plot_pdf - - sc_atac_cluster/slh_gr_clst_res_plot_pdf - - sc_atac_cluster/umap_gr_clst_spl_idnt_res_plot_pdf - - sc_atac_cluster/cmp_gr_clst_spl_idnt_res_plot_pdf - - sc_atac_cluster/cmp_gr_idnt_spl_clst_res_plot_pdf - - sc_atac_cluster/umap_gr_clst_spl_cnd_res_plot_pdf - - sc_atac_cluster/cmp_gr_clst_spl_cnd_res_plot_pdf - - sc_atac_cluster/cmp_gr_cnd_spl_clst_res_plot_pdf - - sc_atac_cluster/cvrg_res_plot_pdf + - sc_atac_cluster/all_plots_pdf valueFrom: $(self.flat().filter(n => n)) folder_basename: default: "pdf_plots" diff --git a/workflows/sc-atac-filter.cwl b/workflows/sc-atac-filter.cwl index 3efb876d..aa352b61 100644 --- a/workflows/sc-atac-filter.cwl +++ b/workflows/sc-atac-filter.cwl @@ -353,9 +353,9 @@ outputs: tab: "Unfiltered" Caption: "QC metrics PCA (unfiltered, PC2/PC3)" - raw_cells_count_plot_png: + raw_cell_cnts_plot_png: type: File? 
- outputSource: sc_atac_filter/raw_cells_count_plot_png + outputSource: sc_atac_filter/raw_cell_cnts_plot_png label: "Number of cells per dataset (unfiltered)" doc: | Number of cells per dataset. @@ -542,9 +542,9 @@ outputs: tab: "Filtered" Caption: "QC metrics PCA (filtered, PC2/PC3)" - fltr_cells_count_plot_png: + fltr_cell_cnts_plot_png: type: File? - outputSource: sc_atac_filter/fltr_cells_count_plot_png + outputSource: sc_atac_filter/fltr_cell_cnts_plot_png label: "Number of cells per dataset (filtered)" doc: | Number of cells per dataset. @@ -827,7 +827,7 @@ steps: out: - raw_1_2_qc_mtrcs_pca_plot_png - raw_2_3_qc_mtrcs_pca_plot_png - - raw_cells_count_plot_png + - raw_cell_cnts_plot_png - raw_frgm_dnst_plot_png - raw_peak_dnst_plot_png - raw_blck_dnst_plot_png @@ -841,7 +841,7 @@ steps: - raw_blck_dnst_spl_cnd_plot_png - fltr_1_2_qc_mtrcs_pca_plot_png - fltr_2_3_qc_mtrcs_pca_plot_png - - fltr_cells_count_plot_png + - fltr_cell_cnts_plot_png - fltr_frgm_dnst_plot_png - fltr_peak_dnst_plot_png - fltr_blck_dnst_plot_png diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 2d7ae677..df16696e 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -234,6 +234,126 @@ inputs: outputs: + cell_cnts_gr_ctyp_plot_png: + type: File? + outputSource: ctype_assign/cell_cnts_gr_ctyp_plot_png + label: "Number of cells per cell type (all cells)" + doc: | + Number of cells per cell type. + All cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Number of cells per cell type (all cells)" + + qc_mtrcs_dnst_gr_ctyp_plot_png: + type: File? + outputSource: ctype_assign/qc_mtrcs_dnst_gr_ctyp_plot_png + label: "Distribution of QC metrics per cell colored by cell type (all cells)" + doc: | + Distribution of QC metrics per cell + colored by cell type. + All cells. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Distribution of QC metrics per cell colored by cell type (all cells)" + + gene_umi_spl_ctyp_plot_png: + type: File? + outputSource: ctype_assign/gene_umi_spl_ctyp_plot_png + label: "Genes vs RNA reads per cell (split by cell type, all cells)" + doc: | + Genes vs RNA reads per cell. + Split by cell type; all cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Genes vs RNA reads per cell (split by cell type, all cells)" + + umi_mito_spl_ctyp_plot_png: + type: File? + outputSource: ctype_assign/umi_mito_spl_ctyp_plot_png + label: "RNA reads vs mitochondrial % per cell (split by cell type, all cells)" + doc: | + RNA reads vs mitochondrial % per cell. + Split by cell type; all cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "RNA reads vs mitochondrial % per cell (split by cell type, all cells)" + + tss_frgm_spl_ctyp_plot_png: + type: File? + outputSource: ctype_assign/tss_frgm_spl_ctyp_plot_png + label: "TSS enrichment score vs ATAC fragments in peaks per cell (split by cell type, all cells)" + doc: | + TSS enrichment score vs ATAC + fragments in peaks per cell. + Split by cell type; all cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "TSS enrichment score vs ATAC fragments in peaks per cell (split by cell type, all cells)" + + rna_atac_cnts_spl_ctyp_plot_png: + type: File? + outputSource: ctype_assign/rna_atac_cnts_spl_ctyp_plot_png + label: "RNA reads vs ATAC fragments in peaks per cell (split by cell type, all cells)" + doc: | + RNA reads vs ATAC fragments in peaks per cell. + Split by cell type; all cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "RNA reads vs ATAC fragments in peaks per cell (split by cell type, all cells)" + + rnadbl_gr_ctyp_plot_png: + type: File? 
+ outputSource: ctype_assign/rnadbl_gr_ctyp_plot_png + label: "Percentage of RNA doublets per cell type (all cells)" + doc: | + Percentage of RNA doublets per cell type. + All cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Percentage of RNA doublets per cell type (all cells)" + + atacdbl_gr_ctyp_plot_png: + type: File? + outputSource: ctype_assign/atacdbl_gr_ctyp_plot_png + label: "Percentage of ATAC doublets per cell type (all cells)" + doc: | + Percentage of ATAC doublets per cell type. + All cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Percentage of ATAC doublets per cell type (all cells)" + + vrlpdbl_gr_ctyp_plot_png: + type: File? + outputSource: ctype_assign/vrlpdbl_gr_ctyp_plot_png + label: "Percentage of RNA and ATAC doublets per cell type (all cells)" + doc: | + Percentage of RNA and ATAC doublets + per cell type. + All cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Percentage of RNA and ATAC doublets per cell type (all cells)" + umap_gr_ctyp_plot_png: type: File? 
outputSource: ctype_assign/umap_gr_ctyp_plot_png @@ -640,6 +760,15 @@ steps: source: threads valueFrom: $(parseInt(self)) out: + - cell_cnts_gr_ctyp_plot_png + - gene_umi_spl_ctyp_plot_png + - umi_mito_spl_ctyp_plot_png + - rnadbl_gr_ctyp_plot_png + - tss_frgm_spl_ctyp_plot_png + - atacdbl_gr_ctyp_plot_png + - rna_atac_cnts_spl_ctyp_plot_png + - vrlpdbl_gr_ctyp_plot_png + - qc_mtrcs_dnst_gr_ctyp_plot_png - umap_gr_ctyp_plot_png - umap_gr_ctyp_spl_idnt_plot_png - cmp_gr_ctyp_spl_idnt_plot_png @@ -656,25 +785,7 @@ steps: - xpr_dnst_plot_png - xpr_htmp_plot_png - cvrg_plot_png - - umap_gr_ctyp_plot_pdf - - umap_gr_ctyp_spl_idnt_plot_pdf - - cmp_gr_ctyp_spl_idnt_plot_pdf - - cmp_gr_idnt_spl_ctyp_plot_pdf - - umap_gr_ph_spl_idnt_plot_pdf - - cmp_gr_ph_spl_idnt_plot_pdf - - umap_gr_ctyp_spl_ph_plot_pdf - - cmp_gr_ph_spl_ctyp_plot_pdf - - umap_gr_ctyp_spl_cnd_plot_pdf - - cmp_gr_ctyp_spl_cnd_plot_pdf - - cmp_gr_cnd_spl_ctyp_plot_pdf - - umap_gr_ph_spl_cnd_plot_pdf - - cmp_gr_ph_spl_cnd_plot_pdf - - xpr_avg_plot_pdf - - xpr_per_cell_plot_pdf - - xpr_per_cell_sgnl_plot_pdf - - xpr_dnst_plot_pdf - - xpr_htmp_plot_pdf - - cvrg_plot_pdf + - all_plots_pdf - xpr_htmp_tsv - gene_markers_tsv - peak_markers_tsv @@ -691,25 +802,7 @@ steps: in: input_files: source: - - ctype_assign/umap_gr_ctyp_plot_pdf - - ctype_assign/umap_gr_ctyp_spl_idnt_plot_pdf - - ctype_assign/cmp_gr_ctyp_spl_idnt_plot_pdf - - ctype_assign/cmp_gr_idnt_spl_ctyp_plot_pdf - - ctype_assign/umap_gr_ph_spl_idnt_plot_pdf - - ctype_assign/cmp_gr_ph_spl_idnt_plot_pdf - - ctype_assign/umap_gr_ctyp_spl_ph_plot_pdf - - ctype_assign/cmp_gr_ph_spl_ctyp_plot_pdf - - ctype_assign/umap_gr_ctyp_spl_cnd_plot_pdf - - ctype_assign/cmp_gr_ctyp_spl_cnd_plot_pdf - - ctype_assign/cmp_gr_cnd_spl_ctyp_plot_pdf - - ctype_assign/umap_gr_ph_spl_cnd_plot_pdf - - ctype_assign/cmp_gr_ph_spl_cnd_plot_pdf - - ctype_assign/xpr_avg_plot_pdf - - ctype_assign/xpr_per_cell_plot_pdf - - ctype_assign/xpr_per_cell_sgnl_plot_pdf - - 
ctype_assign/xpr_dnst_plot_pdf - - ctype_assign/xpr_htmp_plot_pdf - - ctype_assign/cvrg_plot_pdf + - ctype_assign/all_plots_pdf valueFrom: $(self.flat().filter(n => n)) folder_basename: default: "pdf_plots" diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index d103ff94..2fdd9ecd 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -508,9 +508,9 @@ outputs: tab: "Unfiltered" Caption: "QC metrics PCA (unfiltered, PC2/PC3)" - raw_cells_count_plot_png: + raw_cell_cnts_plot_png: type: File? - outputSource: sc_multiome_filter/raw_cells_count_plot_png + outputSource: sc_multiome_filter/raw_cell_cnts_plot_png label: "Number of cells per dataset (unfiltered)" doc: | Number of cells per dataset. @@ -870,9 +870,9 @@ outputs: tab: "Filtered" Caption: "QC metrics PCA (filtered, PC2/PC3)" - fltr_cells_count_plot_png: + fltr_cell_cnts_plot_png: type: File? - outputSource: sc_multiome_filter/fltr_cells_count_plot_png + outputSource: sc_multiome_filter/fltr_cell_cnts_plot_png label: "Number of cells per dataset (filtered)" doc: | Number of cells per dataset. 
@@ -1374,7 +1374,7 @@ steps: out: - raw_1_2_qc_mtrcs_pca_plot_png - raw_2_3_qc_mtrcs_pca_plot_png - - raw_cells_count_plot_png + - raw_cell_cnts_plot_png - raw_umi_dnst_plot_png - raw_gene_dnst_plot_png - raw_gene_umi_plot_png @@ -1401,7 +1401,7 @@ steps: - raw_blck_dnst_spl_cnd_plot_png - fltr_1_2_qc_mtrcs_pca_plot_png - fltr_2_3_qc_mtrcs_pca_plot_png - - fltr_cells_count_plot_png + - fltr_cell_cnts_plot_png - fltr_umi_dnst_plot_png - fltr_gene_dnst_plot_png - fltr_gene_umi_plot_png diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index ac9333ce..20ed1c40 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -163,6 +163,89 @@ inputs: outputs: + cell_cnts_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/cell_cnts_gr_clst_res_plot_png + label: "Number of cells per cluster (all cells)" + doc: | + Number of cells per cluster. + All cells; all resolutions. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Number of cells per cluster (all cells)" + + qc_mtrcs_dnst_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/qc_mtrcs_dnst_gr_clst_res_plot_png + label: "Distribution of QC metrics per cell colored by cluster (all cells)" + doc: | + Distribution of QC metrics per cell + colored by cluster. + All cells; all resolutions. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Distribution of QC metrics per cell colored by cluster (all cells)" + + gene_umi_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/gene_umi_spl_clst_res_plot_png + label: "Genes vs RNA reads per cell (split by cluster, all cells)" + doc: | + Genes vs RNA reads per cell. + Split by cluster; all cells; + all resolutions. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Genes vs RNA reads per cell (split by cluster, all cells)" + + umi_mito_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/umi_mito_spl_clst_res_plot_png + label: "RNA reads vs mitochondrial % per cell (split by cluster, all cells)" + doc: | + RNA reads vs mitochondrial % per cell. + Split by cluster; all cells; all + resolutions. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "RNA reads vs mitochondrial % per cell (split by cluster, all cells)" + + rnadbl_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/rnadbl_gr_clst_res_plot_png + label: "Percentage of RNA doublets per cluster (all cells)" + doc: | + Percentage of RNA doublets per cluster. + All cells; all resolutions. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Percentage of RNA doublets per cluster (all cells)" + umap_gr_clst_res_plot_png: type: - "null" @@ -555,6 +638,11 @@ steps: source: threads valueFrom: $(parseInt(self)) out: + - cell_cnts_gr_clst_res_plot_png + - gene_umi_spl_clst_res_plot_png + - umi_mito_spl_clst_res_plot_png + - rnadbl_gr_clst_res_plot_png + - qc_mtrcs_dnst_gr_clst_res_plot_png - umap_gr_ph_spl_idnt_plot_png - cmp_gr_ph_spl_idnt_plot_png - umap_gr_ph_spl_cnd_plot_png @@ -571,25 +659,7 @@ steps: - xpr_avg_res_plot_png - xpr_dnst_res_plot_png - xpr_htmp_res_plot_png - - umap_gr_ph_spl_idnt_plot_pdf - - cmp_gr_ph_spl_idnt_plot_pdf - - umap_gr_ph_spl_cnd_plot_pdf - - cmp_gr_ph_spl_cnd_plot_pdf - - umap_gr_clst_res_plot_pdf - - slh_gr_clst_res_plot_pdf - - umap_gr_clst_spl_idnt_res_plot_pdf - - cmp_gr_clst_spl_idnt_res_plot_pdf - - cmp_gr_idnt_spl_clst_res_plot_pdf - - umap_gr_clst_spl_ph_res_plot_pdf - - cmp_gr_ph_spl_clst_res_plot_pdf - - umap_gr_clst_spl_cnd_res_plot_pdf - - cmp_gr_clst_spl_cnd_res_plot_pdf - - cmp_gr_cnd_spl_clst_res_plot_pdf - - xpr_per_cell_plot_pdf - - 
xpr_per_cell_sgnl_plot_pdf - - xpr_avg_res_plot_pdf - - xpr_dnst_res_plot_pdf - - xpr_htmp_res_plot_pdf + - all_plots_pdf - xpr_htmp_res_tsv - gene_markers_tsv - ucsc_cb_html_data @@ -605,25 +675,7 @@ steps: in: input_files: source: - - sc_rna_cluster/umap_gr_ph_spl_idnt_plot_pdf - - sc_rna_cluster/cmp_gr_ph_spl_idnt_plot_pdf - - sc_rna_cluster/umap_gr_ph_spl_cnd_plot_pdf - - sc_rna_cluster/cmp_gr_ph_spl_cnd_plot_pdf - - sc_rna_cluster/umap_gr_clst_res_plot_pdf - - sc_rna_cluster/slh_gr_clst_res_plot_pdf - - sc_rna_cluster/umap_gr_clst_spl_idnt_res_plot_pdf - - sc_rna_cluster/cmp_gr_clst_spl_idnt_res_plot_pdf - - sc_rna_cluster/cmp_gr_idnt_spl_clst_res_plot_pdf - - sc_rna_cluster/umap_gr_clst_spl_ph_res_plot_pdf - - sc_rna_cluster/cmp_gr_ph_spl_clst_res_plot_pdf - - sc_rna_cluster/umap_gr_clst_spl_cnd_res_plot_pdf - - sc_rna_cluster/cmp_gr_clst_spl_cnd_res_plot_pdf - - sc_rna_cluster/cmp_gr_cnd_spl_clst_res_plot_pdf - - sc_rna_cluster/xpr_per_cell_plot_pdf - - sc_rna_cluster/xpr_per_cell_sgnl_plot_pdf - - sc_rna_cluster/xpr_avg_res_plot_pdf - - sc_rna_cluster/xpr_dnst_res_plot_pdf - - sc_rna_cluster/xpr_htmp_res_plot_pdf + - sc_rna_cluster/all_plots_pdf valueFrom: $(self.flat().filter(n => n)) folder_basename: default: "pdf_plots" diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index baa04ec6..8ef37599 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -320,9 +320,9 @@ outputs: tab: "Unfiltered" Caption: "QC metrics PCA (unfiltered, PC2/PC3)" - raw_cells_count_plot_png: + raw_cell_cnts_plot_png: type: File? - outputSource: sc_rna_filter/raw_cells_count_plot_png + outputSource: sc_rna_filter/raw_cell_cnts_plot_png label: "Number of cells per dataset (unfiltered)" doc: | Number of cells per dataset. @@ -518,9 +518,9 @@ outputs: tab: "Filtered" Caption: "QC metrics PCA (filtered, PC2/PC3)" - fltr_cells_count_plot_png: + fltr_cell_cnts_plot_png: type: File? 
- outputSource: sc_rna_filter/fltr_cells_count_plot_png + outputSource: sc_rna_filter/fltr_cell_cnts_plot_png label: "Number of cells per dataset (filtered)" doc: | Number of cells per dataset. @@ -809,7 +809,7 @@ steps: out: - raw_1_2_qc_mtrcs_pca_plot_png - raw_2_3_qc_mtrcs_pca_plot_png - - raw_cells_count_plot_png + - raw_cell_cnts_plot_png - raw_umi_dnst_plot_png - raw_gene_dnst_plot_png - raw_gene_umi_plot_png @@ -824,7 +824,7 @@ steps: - raw_nvlt_dnst_spl_cnd_plot_png - fltr_1_2_qc_mtrcs_pca_plot_png - fltr_2_3_qc_mtrcs_pca_plot_png - - fltr_cells_count_plot_png + - fltr_cell_cnts_plot_png - fltr_umi_dnst_plot_png - fltr_gene_dnst_plot_png - fltr_gene_umi_plot_png diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index 599988ee..871aa700 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -212,6 +212,156 @@ inputs: outputs: + cell_cnts_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/cell_cnts_gr_clst_res_plot_png + label: "Number of cells per cluster (all cells)" + doc: | + Number of cells per cluster. + All cells; all resolutions. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Number of cells per cluster (all cells)" + + qc_mtrcs_dnst_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/qc_mtrcs_dnst_gr_clst_res_plot_png + label: "Distribution of QC metrics per cell colored by cluster (all cells)" + doc: | + Distribution of QC metrics per cell + colored by cluster. + All cells; all resolutions. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Distribution of QC metrics per cell colored by cluster (all cells)" + + gene_umi_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/gene_umi_spl_clst_res_plot_png + label: "Genes vs RNA reads per cell (split by cluster, all cells)" + doc: | + Genes vs RNA reads per cell. 
+ Split by cluster; all cells; + all resolutions. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Genes vs RNA reads per cell (split by cluster, all cells)" + + umi_mito_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/umi_mito_spl_clst_res_plot_png + label: "RNA reads vs mitochondrial % per cell (split by cluster, all cells)" + doc: | + RNA reads vs mitochondrial % per cell. + Split by cluster; all cells; all + resolutions. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "RNA reads vs mitochondrial % per cell (split by cluster, all cells)" + + tss_frgm_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/tss_frgm_spl_clst_res_plot_png + label: "TSS enrichment score vs ATAC fragments in peaks per cell (split by cluster, all cells)" + doc: | + TSS enrichment score vs ATAC + fragments in peaks per cell. + Split by cluster; all cells; + all resolutions. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "TSS enrichment score vs ATAC fragments in peaks per cell (split by cluster, all cells)" + + rna_atac_cnts_spl_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/rna_atac_cnts_spl_clst_res_plot_png + label: "RNA reads vs ATAC fragments in peaks per cell (split by cluster, all cells)" + doc: | + RNA reads vs ATAC fragments in peaks per cell. + Split by cluster; all cells; all resolutions. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "RNA reads vs ATAC fragments in peaks per cell (split by cluster, all cells)" + + rnadbl_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/rnadbl_gr_clst_res_plot_png + label: "Percentage of RNA doublets per cluster (all cells)" + doc: | + Percentage of RNA doublets per cluster. + All cells; all resolutions. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Percentage of RNA doublets per cluster (all cells)" + + atacdbl_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/atacdbl_gr_clst_res_plot_png + label: "Percentage of ATAC doublets per cluster (all cells)" + doc: | + Percentage of ATAC doublets per cluster. + All cells; all resolutions. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Percentage of ATAC doublets per cluster (all cells)" + + vrlpdbl_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/vrlpdbl_gr_clst_res_plot_png + label: "Percentage of RNA and ATAC doublets per cluster (all cells)" + doc: | + Percentage of RNA and ATAC doublets + per cluster. + All cells; all resolutions. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Percentage of RNA and ATAC doublets per cluster (all cells)" + umap_gr_clst_res_plot_png: type: - "null" @@ -625,6 +775,15 @@ steps: source: threads valueFrom: $(parseInt(self)) out: + - cell_cnts_gr_clst_res_plot_png + - gene_umi_spl_clst_res_plot_png + - umi_mito_spl_clst_res_plot_png + - rna_atac_cnts_spl_clst_res_plot_png + - tss_frgm_spl_clst_res_plot_png + - rnadbl_gr_clst_res_plot_png + - atacdbl_gr_clst_res_plot_png + - vrlpdbl_gr_clst_res_plot_png + - qc_mtrcs_dnst_gr_clst_res_plot_png - umap_gr_ph_spl_idnt_plot_png - cmp_gr_ph_spl_idnt_plot_png - umap_gr_ph_spl_cnd_plot_png @@ -641,25 +800,7 @@ steps: - xpr_dnst_res_plot_png - xpr_htmp_res_plot_png - cvrg_res_plot_png - - umap_gr_ph_spl_idnt_plot_pdf - - cmp_gr_ph_spl_idnt_plot_pdf - - umap_gr_ph_spl_cnd_plot_pdf - - cmp_gr_ph_spl_cnd_plot_pdf - - umap_gr_clst_res_plot_pdf - - umap_gr_clst_spl_idnt_res_plot_pdf - - cmp_gr_clst_spl_idnt_res_plot_pdf - - cmp_gr_idnt_spl_clst_res_plot_pdf - - umap_gr_clst_spl_ph_res_plot_pdf - - cmp_gr_ph_spl_clst_res_plot_pdf - - umap_gr_clst_spl_cnd_res_plot_pdf - - 
cmp_gr_clst_spl_cnd_res_plot_pdf - - cmp_gr_cnd_spl_clst_res_plot_pdf - - xpr_per_cell_plot_pdf - - xpr_per_cell_sgnl_plot_pdf - - xpr_avg_res_plot_pdf - - xpr_dnst_res_plot_pdf - - xpr_htmp_res_plot_pdf - - cvrg_res_plot_pdf + - all_plots_pdf - xpr_htmp_res_tsv - gene_markers_tsv - peak_markers_tsv @@ -676,25 +817,7 @@ steps: in: input_files: source: - - sc_wnn_cluster/umap_gr_ph_spl_idnt_plot_pdf - - sc_wnn_cluster/cmp_gr_ph_spl_idnt_plot_pdf - - sc_wnn_cluster/umap_gr_ph_spl_cnd_plot_pdf - - sc_wnn_cluster/cmp_gr_ph_spl_cnd_plot_pdf - - sc_wnn_cluster/umap_gr_clst_res_plot_pdf - - sc_wnn_cluster/umap_gr_clst_spl_idnt_res_plot_pdf - - sc_wnn_cluster/cmp_gr_clst_spl_idnt_res_plot_pdf - - sc_wnn_cluster/cmp_gr_idnt_spl_clst_res_plot_pdf - - sc_wnn_cluster/umap_gr_clst_spl_ph_res_plot_pdf - - sc_wnn_cluster/cmp_gr_ph_spl_clst_res_plot_pdf - - sc_wnn_cluster/umap_gr_clst_spl_cnd_res_plot_pdf - - sc_wnn_cluster/cmp_gr_clst_spl_cnd_res_plot_pdf - - sc_wnn_cluster/cmp_gr_cnd_spl_clst_res_plot_pdf - - sc_wnn_cluster/xpr_per_cell_plot_pdf - - sc_wnn_cluster/xpr_per_cell_sgnl_plot_pdf - - sc_wnn_cluster/xpr_avg_res_plot_pdf - - sc_wnn_cluster/xpr_dnst_res_plot_pdf - - sc_wnn_cluster/xpr_htmp_res_plot_pdf - - sc_wnn_cluster/cvrg_res_plot_pdf + - sc_wnn_cluster/all_plots_pdf valueFrom: $(self.flat().filter(n => n)) folder_basename: default: "pdf_plots" From 000d833d3132082e5b0925ca184b2e3d255b3b26 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 2 May 2024 15:58:24 -0400 Subject: [PATCH 137/162] Add regions of interest input to Souporcell RNA pipeline --- tools/souporcell.cwl | 333 +++++++++++++++++++++++++++++++++++ workflows/souporcell-rna.cwl | 11 ++ 2 files changed, 344 insertions(+) create mode 100644 tools/souporcell.cwl diff --git a/tools/souporcell.cwl b/tools/souporcell.cwl new file mode 100644 index 00000000..6dde9175 --- /dev/null +++ b/tools/souporcell.cwl @@ -0,0 +1,333 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: 
InlineJavascriptRequirement + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/souporcell:v0.0.1 + + +inputs: + + possorted_genome_bam_bai: + type: File + secondaryFiles: + - .bai + inputBinding: + position: 5 + prefix: "--bam" + doc: | + Position-sorted indexed aligned to the reference genome and transcriptome gene + expression reads file annotated with barcode information in BAM+BAI format. + + barcodes_tsv_file: + type: File + inputBinding: + position: 6 + prefix: "--barcodes" + doc: | + Cellular barcodes TSV file from the filtered feature-barcode matrices folder + generated by Cell Ranger Count (if run ARC then use only GEX matrices) + + genome_fasta_file: + type: File + secondaryFiles: + - .fai + inputBinding: + position: 7 + prefix: "--fasta" + doc: | + Reference genome FASTA file + fai index file + + regions_bed_file: + type: File? + inputBinding: + position: 8 + prefix: "--regions" + doc: | + Regions of interest BED file to filter provided or remapped BAM file + + clusters_count: + type: int + inputBinding: + position: 9 + prefix: "--clusters" + doc: | + Number of clusters to detect (number of donors merged into one single-cell experiment) + + ploidy_count: + type: int? + inputBinding: + position: 10 + prefix: "--ploidy" + doc: | + Ploidy, must be 1 or 2 + Default: 2 + + min_alt: + type: int? + inputBinding: + position: 11 + prefix: "--min_alt" + doc: | + Min alt to use locus + Default: 10 + + min_ref: + type: int? + inputBinding: + position: 12 + prefix: "--min_ref" + doc: | + Min ref to use locus + Default: 10 + + max_loci: + type: int? + inputBinding: + position: 13 + prefix: "--max_loci" + doc: | + Max loci per cell, affects speed + Default: 2048 + + restarts_count: + type: int? + inputBinding: + position: 14 + prefix: "--restarts" + doc: | + Number of restarts in clustering, when there are > 12 + clusters we recommend increasing this to avoid local + minima + Default: 100 + + common_variants_vcf_file: + type: File? 
+ inputBinding: + position: 15 + prefix: "--common_variants" + doc: | + Common variant loci or known variant loci vcf file, + must be made vs the same reference fasta + + known_genotypes_vcf_file: + type: File? + inputBinding: + position: 16 + prefix: "--known_genotypes" + doc: | + Known variants per clone in population vcf mode, must be .vcf + + known_genotypes_sample_names: + type: + - "null" + - string + - type: array + items: string + inputBinding: + position: 17 + prefix: "--known_genotypes_sample_names" + doc: | + Which samples in population vcf from known genotypes + option represent the donors in your sample + + skip_remap: + type: boolean? + default: false + inputBinding: + position: 18 + prefix: "--skip_remap" + valueFrom: $(self?"True":null) # when we return null --skip_remap prefix won't be used at all + doc: | + Don't remap with minimap2 (not recommended unless in + conjunction with --common_variants) + + no_umi: + type: boolean? + default: false + inputBinding: + position: 19 + prefix: "--no_umi" + valueFrom: $(self?"True":"False") # Souporcell expects word, not just boolean flag + doc: | + Set to True if your bam has no UMI tag, will + ignore/override --umi_tag + + umi_tag: + type: string? + inputBinding: + position: 20 + prefix: "--umi_tag" + doc: | + Set if your umi tag is not UB + + cell_tag: + type: string? + inputBinding: + position: 21 + prefix: "--cell_tag" + doc: | + Set if your cell barcode tag is not CB + + ignore_data_errors: + type: boolean? + inputBinding: + position: 22 + prefix: "--ignore" + doc: | + Set to True to ignore data error assertions + + threads: + type: int? 
+ default: 1 + inputBinding: + position: 23 + prefix: "--threads" + doc: | + Max threads to use + Forced default: 1 + + +outputs: + + genotype_cluster_tsv_file: + type: File + outputBinding: + glob: "./souporcell/clusters.tsv" + doc: "Cellurar barcodes file clustered by genotype" + + genotype_cluster_vcf_file: + type: File + outputBinding: + glob: "./souporcell/cluster_genotypes.vcf" + doc: | + VCF file with genotypes for each cluster for each variant call. + Refer to http://software.broadinstitute.org/software/igv/viewing_vcf_files + for track description when displaying in IGV. + + ambient_rna_file: + type: File + outputBinding: + glob: "./souporcell/ambient_rna.txt" + doc: "Ambient RNA evaluation text file" + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["souporcell_pipeline.py", "--out_dir", "./souporcell"] + + +stdout: souporcell_stdout.log +stderr: souporcell_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Souporcell Cluster by Genotype" +s:name: "Souporcell Cluster by Genotype" +s:alternateName: "Souporcell: robust clustering of single-cell RNA-seq data by genotype without reference genotypes" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/souporcell.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: 
"https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Souporcell Cluster by Genotype + ============================== + + Souporcell: robust clustering of single-cell RNA-seq data by genotype without reference genotypes + + --out_dir - harcoded to ./souporcell + --threads - forced to the default 1, because it's required parameter + --aligner - not added to CWL inputs + + +s:about: | + usage: souporcell_pipeline.py [-h] -i BAM -b BARCODES [--regions REGIONS] -f FASTA -t THREADS -o OUT_DIR -k CLUSTERS + [-p PLOIDY] [--min_alt MIN_ALT] [--min_ref MIN_REF] [--max_loci MAX_LOCI] [--restarts RESTARTS] + [--common_variants COMMON_VARIANTS] [--known_genotypes KNOWN_GENOTYPES] + [--known_genotypes_sample_names KNOWN_GENOTYPES_SAMPLE_NAMES [KNOWN_GENOTYPES_SAMPLE_NAMES ...]] + [--skip_remap SKIP_REMAP] [--no_umi NO_UMI] [--umi_tag UMI_TAG] [--cell_tag CELL_TAG] + [--ignore IGNORE] [--aligner ALIGNER] + + single cell RNAseq mixed genotype clustering using sparse mixture model clustering. 
+ + optional arguments: + -h, --help show this help message and exit + -i BAM, --bam BAM cellranger bam + -b BARCODES, --barcodes BARCODES + barcodes.tsv from cellranger + --regions REGIONS regions.bed file to optionally restrict searching for SNPs to only selected regions + -f FASTA, --fasta FASTA + reference fasta file + -t THREADS, --threads THREADS + max threads to use + -o OUT_DIR, --out_dir OUT_DIR + name of directory to place souporcell files + -k CLUSTERS, --clusters CLUSTERS + number cluster, tbd add easy way to run on a range of k + -p PLOIDY, --ploidy PLOIDY + ploidy, must be 1 or 2, default = 2 + --min_alt MIN_ALT min alt to use locus, default = 10. + --min_ref MIN_REF min ref to use locus, default = 10. + --max_loci MAX_LOCI max loci per cell, affects speed, default = 2048. + --restarts RESTARTS number of restarts in clustering, when there are > 12 clusters we recommend increasing this to avoid + local minima + --common_variants COMMON_VARIANTS + common variant loci or known variant loci vcf, must be vs same reference fasta + --known_genotypes KNOWN_GENOTYPES + known variants per clone in population vcf mode, must be .vcf right now we dont accept gzip or bcf + sorry + --known_genotypes_sample_names KNOWN_GENOTYPES_SAMPLE_NAMES [KNOWN_GENOTYPES_SAMPLE_NAMES ...] + which samples in population vcf from known genotypes option represent the donors in your sample + --skip_remap SKIP_REMAP + don't remap with minimap2 (not recommended unless in conjunction with --common_variants + --no_umi NO_UMI set to True if your bam has no UMI tag, will ignore/override --umi_tag + --umi_tag UMI_TAG set if your umi tag is not UB + --cell_tag CELL_TAG DOES NOT WORK, vartrix doesnt support this! 
set if your cell barcode tag is not CB + --ignore IGNORE set to True to ignore data error assertions + --aligner ALIGNER optionally change to HISAT2 if you have it installed, not included in singularity build \ No newline at end of file diff --git a/workflows/souporcell-rna.cwl b/workflows/souporcell-rna.cwl index b1b07ee9..d31ec701 100644 --- a/workflows/souporcell-rna.cwl +++ b/workflows/souporcell-rna.cwl @@ -71,6 +71,16 @@ inputs: barcode per line and do not include any header information. + regions_bed_file: + type: File? + label: "Selected regions (optional)" + doc: | + A BED file to optionally prefilter + reads from the provided BAM file. + If minimap2 remapping is not skipped, + filtering by regions will be done + after it. + ploidy_count: type: int? default: 2 @@ -268,6 +278,7 @@ steps: possorted_genome_bam_bai: possorted_genome_bam_bai barcodes_tsv_file: get_barcodes_tsv_file/barcodes_tsv_file genome_fasta_file: genome_fasta_file + regions_bed_file: regions_bed_file clusters_count: clusters_count ploidy_count: ploidy_count min_alt: min_alt From 48bb7286f37b13252ecf529c5ae721d726bc1bd2 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 6 May 2024 13:27:29 -0400 Subject: [PATCH 138/162] No reason to include Souporcell as it not finished yet --- tools/souporcell.cwl | 333 -------------------------------- workflows/souporcell-rna.cwl | 354 ----------------------------------- 2 files changed, 687 deletions(-) delete mode 100644 tools/souporcell.cwl delete mode 100644 workflows/souporcell-rna.cwl diff --git a/tools/souporcell.cwl b/tools/souporcell.cwl deleted file mode 100644 index 6dde9175..00000000 --- a/tools/souporcell.cwl +++ /dev/null @@ -1,333 +0,0 @@ -cwlVersion: v1.0 -class: CommandLineTool - - -requirements: -- class: InlineJavascriptRequirement - - -hints: -- class: DockerRequirement - dockerPull: biowardrobe2/souporcell:v0.0.1 - - -inputs: - - possorted_genome_bam_bai: - type: File - secondaryFiles: - - .bai - inputBinding: - position: 5 - 
prefix: "--bam" - doc: | - Position-sorted indexed aligned to the reference genome and transcriptome gene - expression reads file annotated with barcode information in BAM+BAI format. - - barcodes_tsv_file: - type: File - inputBinding: - position: 6 - prefix: "--barcodes" - doc: | - Cellular barcodes TSV file from the filtered feature-barcode matrices folder - generated by Cell Ranger Count (if run ARC then use only GEX matrices) - - genome_fasta_file: - type: File - secondaryFiles: - - .fai - inputBinding: - position: 7 - prefix: "--fasta" - doc: | - Reference genome FASTA file + fai index file - - regions_bed_file: - type: File? - inputBinding: - position: 8 - prefix: "--regions" - doc: | - Regions of interest BED file to filter provided or remapped BAM file - - clusters_count: - type: int - inputBinding: - position: 9 - prefix: "--clusters" - doc: | - Number of clusters to detect (number of donors merged into one single-cell experiment) - - ploidy_count: - type: int? - inputBinding: - position: 10 - prefix: "--ploidy" - doc: | - Ploidy, must be 1 or 2 - Default: 2 - - min_alt: - type: int? - inputBinding: - position: 11 - prefix: "--min_alt" - doc: | - Min alt to use locus - Default: 10 - - min_ref: - type: int? - inputBinding: - position: 12 - prefix: "--min_ref" - doc: | - Min ref to use locus - Default: 10 - - max_loci: - type: int? - inputBinding: - position: 13 - prefix: "--max_loci" - doc: | - Max loci per cell, affects speed - Default: 2048 - - restarts_count: - type: int? - inputBinding: - position: 14 - prefix: "--restarts" - doc: | - Number of restarts in clustering, when there are > 12 - clusters we recommend increasing this to avoid local - minima - Default: 100 - - common_variants_vcf_file: - type: File? - inputBinding: - position: 15 - prefix: "--common_variants" - doc: | - Common variant loci or known variant loci vcf file, - must be made vs the same reference fasta - - known_genotypes_vcf_file: - type: File? 
- inputBinding: - position: 16 - prefix: "--known_genotypes" - doc: | - Known variants per clone in population vcf mode, must be .vcf - - known_genotypes_sample_names: - type: - - "null" - - string - - type: array - items: string - inputBinding: - position: 17 - prefix: "--known_genotypes_sample_names" - doc: | - Which samples in population vcf from known genotypes - option represent the donors in your sample - - skip_remap: - type: boolean? - default: false - inputBinding: - position: 18 - prefix: "--skip_remap" - valueFrom: $(self?"True":null) # when we return null --skip_remap prefix won't be used at all - doc: | - Don't remap with minimap2 (not recommended unless in - conjunction with --common_variants) - - no_umi: - type: boolean? - default: false - inputBinding: - position: 19 - prefix: "--no_umi" - valueFrom: $(self?"True":"False") # Souporcell expects word, not just boolean flag - doc: | - Set to True if your bam has no UMI tag, will - ignore/override --umi_tag - - umi_tag: - type: string? - inputBinding: - position: 20 - prefix: "--umi_tag" - doc: | - Set if your umi tag is not UB - - cell_tag: - type: string? - inputBinding: - position: 21 - prefix: "--cell_tag" - doc: | - Set if your cell barcode tag is not CB - - ignore_data_errors: - type: boolean? - inputBinding: - position: 22 - prefix: "--ignore" - doc: | - Set to True to ignore data error assertions - - threads: - type: int? - default: 1 - inputBinding: - position: 23 - prefix: "--threads" - doc: | - Max threads to use - Forced default: 1 - - -outputs: - - genotype_cluster_tsv_file: - type: File - outputBinding: - glob: "./souporcell/clusters.tsv" - doc: "Cellurar barcodes file clustered by genotype" - - genotype_cluster_vcf_file: - type: File - outputBinding: - glob: "./souporcell/cluster_genotypes.vcf" - doc: | - VCF file with genotypes for each cluster for each variant call. 
- Refer to http://software.broadinstitute.org/software/igv/viewing_vcf_files - for track description when displaying in IGV. - - ambient_rna_file: - type: File - outputBinding: - glob: "./souporcell/ambient_rna.txt" - doc: "Ambient RNA evaluation text file" - - stdout_log: - type: stdout - - stderr_log: - type: stderr - - -baseCommand: ["souporcell_pipeline.py", "--out_dir", "./souporcell"] - - -stdout: souporcell_stdout.log -stderr: souporcell_stderr.log - - -$namespaces: - s: http://schema.org/ - -$schemas: -- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - -label: "Souporcell Cluster by Genotype" -s:name: "Souporcell Cluster by Genotype" -s:alternateName: "Souporcell: robust clustering of single-cell RNA-seq data by genotype without reference genotypes" - -s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/souporcell.cwl -s:codeRepository: https://github.com/Barski-lab/workflows -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Cincinnati Children's Hospital Medical Center" - s:location: - - class: s:PostalAddress - s:addressCountry: "USA" - s:addressLocality: "Cincinnati" - s:addressRegion: "OH" - s:postalCode: "45229" - s:streetAddress: "3333 Burnet Ave" - s:telephone: "+1(513)636-4200" - s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" - s:department: - - class: s:Organization - s:legalName: "Allergy and Immunology" - s:department: - - class: s:Organization - s:legalName: "Barski Research Lab" - s:member: - - class: s:Person - s:name: Michael Kotliar - s:email: mailto:misha.kotliar@gmail.com - s:sameAs: - - id: http://orcid.org/0000-0002-6486-3898 - - -doc: | - Souporcell Cluster by Genotype - ============================== - - Souporcell: robust 
clustering of single-cell RNA-seq data by genotype without reference genotypes - - --out_dir - harcoded to ./souporcell - --threads - forced to the default 1, because it's required parameter - --aligner - not added to CWL inputs - - -s:about: | - usage: souporcell_pipeline.py [-h] -i BAM -b BARCODES [--regions REGIONS] -f FASTA -t THREADS -o OUT_DIR -k CLUSTERS - [-p PLOIDY] [--min_alt MIN_ALT] [--min_ref MIN_REF] [--max_loci MAX_LOCI] [--restarts RESTARTS] - [--common_variants COMMON_VARIANTS] [--known_genotypes KNOWN_GENOTYPES] - [--known_genotypes_sample_names KNOWN_GENOTYPES_SAMPLE_NAMES [KNOWN_GENOTYPES_SAMPLE_NAMES ...]] - [--skip_remap SKIP_REMAP] [--no_umi NO_UMI] [--umi_tag UMI_TAG] [--cell_tag CELL_TAG] - [--ignore IGNORE] [--aligner ALIGNER] - - single cell RNAseq mixed genotype clustering using sparse mixture model clustering. - - optional arguments: - -h, --help show this help message and exit - -i BAM, --bam BAM cellranger bam - -b BARCODES, --barcodes BARCODES - barcodes.tsv from cellranger - --regions REGIONS regions.bed file to optionally restrict searching for SNPs to only selected regions - -f FASTA, --fasta FASTA - reference fasta file - -t THREADS, --threads THREADS - max threads to use - -o OUT_DIR, --out_dir OUT_DIR - name of directory to place souporcell files - -k CLUSTERS, --clusters CLUSTERS - number cluster, tbd add easy way to run on a range of k - -p PLOIDY, --ploidy PLOIDY - ploidy, must be 1 or 2, default = 2 - --min_alt MIN_ALT min alt to use locus, default = 10. - --min_ref MIN_REF min ref to use locus, default = 10. - --max_loci MAX_LOCI max loci per cell, affects speed, default = 2048. 
- --restarts RESTARTS number of restarts in clustering, when there are > 12 clusters we recommend increasing this to avoid - local minima - --common_variants COMMON_VARIANTS - common variant loci or known variant loci vcf, must be vs same reference fasta - --known_genotypes KNOWN_GENOTYPES - known variants per clone in population vcf mode, must be .vcf right now we dont accept gzip or bcf - sorry - --known_genotypes_sample_names KNOWN_GENOTYPES_SAMPLE_NAMES [KNOWN_GENOTYPES_SAMPLE_NAMES ...] - which samples in population vcf from known genotypes option represent the donors in your sample - --skip_remap SKIP_REMAP - don't remap with minimap2 (not recommended unless in conjunction with --common_variants - --no_umi NO_UMI set to True if your bam has no UMI tag, will ignore/override --umi_tag - --umi_tag UMI_TAG set if your umi tag is not UB - --cell_tag CELL_TAG DOES NOT WORK, vartrix doesnt support this! set if your cell barcode tag is not CB - --ignore IGNORE set to True to ignore data error assertions - --aligner ALIGNER optionally change to HISAT2 if you have it installed, not included in singularity build \ No newline at end of file diff --git a/workflows/souporcell-rna.cwl b/workflows/souporcell-rna.cwl deleted file mode 100644 index d31ec701..00000000 --- a/workflows/souporcell-rna.cwl +++ /dev/null @@ -1,354 +0,0 @@ -cwlVersion: v1.1 -class: Workflow - - -requirements: -- class: SubworkflowFeatureRequirement -- class: StepInputExpressionRequirement -- class: MultipleInputFeatureRequirement -- class: InlineJavascriptRequirement - expressionLib: - - var split_by_common_delim = function(line) { - function get_unique(value, index, self) { - return self.indexOf(value) === index && value != ""; - } - let splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; - return (splitted_line && !!splitted_line.length)?splitted_line:null; - }; - - -"sd:upstream": - sc_rnaseq_sample: - - "cellranger-multi.cwl" - - "single-cell-preprocess-cellranger.cwl" - - -inputs: 
- - alias: - type: string - label: "Analysis name" - sd:preview: - position: 1 - - possorted_genome_bam_bai: - type: File - secondaryFiles: - - .bai - label: "Cell Ranger Count RNA or RNA+VDJ Sample" - doc: | - Any Cell Ranger Count (RNA) or - Cell Ranger Count (RNA+VDJ) Sample - "sd:upstreamSource": "sc_rnaseq_sample/possorted_genome_bam_bai" - "sd:localLabel": true - - genome_fasta_file: - type: File - secondaryFiles: - - .fai - label: "Cell Ranger Count RNA or RNA+VDJ Sample" - "sd:upstreamSource": "sc_rnaseq_sample/genome_indices/genome_indices/fasta_output" - - filtered_feature_bc_matrix_folder: - type: File - label: "Cell Ranger Count RNA or RNA+VDJ Sample" - "sd:upstreamSource": "sc_rnaseq_sample/filtered_feature_bc_matrix_folder" - - clusters_count: - type: int - label: "Number of clusters to detect (number of donors merged into one single-cell experiment)" - doc: | - Number of clusters to detect (number of donors merged into one single-cell experiment) - - barcodes_data: - type: File? - label: "Selected cell barcodes (optional)" - doc: | - A TSV/CSV file to optionally prefilter - the single cell data by including only - the cells with the selected barcodes. - The provided file should have one cell - barcode per line and do not include any - header information. - - regions_bed_file: - type: File? - label: "Selected regions (optional)" - doc: | - A BED file to optionally prefilter - reads from the provided BAM file. - If minimap2 remapping is not skipped, - filtering by regions will be done - after it. - - ploidy_count: - type: int? - default: 2 - label: "Ploidy, must be 1 or 2" - doc: | - Ploidy, must be 1 or 2 - "sd:layout": - advanced: true - - min_alt: - type: int? - default: 10 - label: "Min alt to use locus" - doc: | - Min alt to use locus - "sd:layout": - advanced: true - - min_ref: - type: int? - default: 10 - label: "Min ref to use locus" - doc: | - Min ref to use locus - "sd:layout": - advanced: true - - max_loci: - type: int? 
- default: 2048 - label: "Max loci per cell, affects speed" - doc: | - Max loci per cell, affects speed - "sd:layout": - advanced: true - - restarts_count: - type: int? - default: 100 - label: "Number of restarts in clustering, when there are > 12 clusters we recommend increasing this to avoid local minima" - doc: | - Number of restarts in clustering, when there are > 12 - clusters we recommend increasing this to avoid local - minima - "sd:layout": - advanced: true - - known_genotypes_sample_names: - type: string? - label: "Which samples in population VCF from known genotypes option represent the donors in your sample" - doc: | - Which samples in population VCF from known genotypes - option represent the donors in your sample - "sd:layout": - advanced: true - - skip_remap: - type: boolean? - default: false - label: "Don't remap with minimap2 (not recommended unless Common variant loci VCF file was provided)" - doc: | - Don't remap with minimap2 (not recommended unless in - conjunction with --common_variants) - "sd:layout": - advanced: true - - ignore_data_errors: - type: boolean? - label: "Ignore data error assertions" - doc: | - Set to True to ignore data error assertions - "sd:layout": - advanced: true - - threads: - type: int? - default: 2 - label: "Threads number to use" - doc: | - Threads number - "sd:layout": - advanced: true - - common_variants_vcf_file: - type: File? - label: "Common variant loci or known variant loci VCF file" - doc: | - Common variant loci or known variant loci VCF file, - must be made vs the same reference fasta - "sd:layout": - advanced: true - - known_genotypes_vcf_file: - type: File? 
- label: "Known variants per clone in population VCF file" - doc: | - Known variants per clone in population VCF mode, must be .vcf - "sd:layout": - advanced: true - - -outputs: - - genotype_cluster_tsv_file: - type: File - outputSource: rna_souporcell/genotype_cluster_tsv_file - label: "Cellurar barcodes file clustered by genotype" - doc: | - Cellurar barcodes file clustered by genotype - "sd:visualPlugins": - - syncfusiongrid: - tab: "Genotypes" - Title: "Cells clustered by genotype" - - genotype_cluster_vcf_file: - type: File - outputSource: rna_souporcell/genotype_cluster_vcf_file - label: "VCF file with genotypes for each cluster for each variant call" - doc: | - VCF file with genotypes for each cluster for each variant call. - Refer to http://software.broadinstitute.org/software/igv/viewing_vcf_files - for track description when displaying in IGV. - - ambient_rna_file: - type: File - outputSource: rna_souporcell/ambient_rna_file - label: "Ambient RNA evaluation text file" - doc: | - Ambient RNA evaluation text file - - rna_souporcell_stdout_log: - type: File - outputSource: rna_souporcell/stdout_log - label: stdout log generated by souporcell - doc: | - stdout log generated by souporcell - - rna_souporcell_stderr_log: - type: File - outputSource: rna_souporcell/stderr_log - label: stderr log generated by souporcell - doc: | - stderr log generated by souporcell - - -steps: - - get_barcodes_tsv_file: - run: - cwlVersion: v1.1 - class: CommandLineTool - hints: - - class: DockerRequirement - dockerPull: biowardrobe2/scidap:v0.0.3 - inputs: - script: - type: string? - default: | - #!/bin/bash - tar xzf $0 - mv */barcodes.tsv.gz . 
- gunzip barcodes.tsv.gz - if [ -f "$1" ]; then - echo "Filter by user provided barcodes" - comm -12 --check-order <(sort barcodes.tsv) <(sort $1) > cell_barcodes.tsv - else - echo "Do not filter by user provided barcodes" - mv barcodes.tsv cell_barcodes.tsv - fi - inputBinding: - position: 5 - filtered_feature_bc_matrix_folder: - type: File - inputBinding: - position: 6 - barcodes_data: - type: File? - inputBinding: - position: 7 - outputs: - barcodes_tsv_file: - type: File - outputBinding: - glob: "cell_barcodes.tsv" - baseCommand: ["bash", "-c"] - in: - filtered_feature_bc_matrix_folder: filtered_feature_bc_matrix_folder - barcodes_data: barcodes_data - out: - - barcodes_tsv_file - - rna_souporcell: - run: ../tools/souporcell.cwl - in: - possorted_genome_bam_bai: possorted_genome_bam_bai - barcodes_tsv_file: get_barcodes_tsv_file/barcodes_tsv_file - genome_fasta_file: genome_fasta_file - regions_bed_file: regions_bed_file - clusters_count: clusters_count - ploidy_count: ploidy_count - min_alt: min_alt - min_ref: min_ref - max_loci: max_loci - restarts_count: restarts_count - common_variants_vcf_file: common_variants_vcf_file - known_genotypes_vcf_file: known_genotypes_vcf_file - known_genotypes_sample_names: - source: known_genotypes_sample_names - valueFrom: $(split_by_common_delim(self)) - skip_remap: skip_remap - ignore_data_errors: ignore_data_errors - threads: threads - out: - - genotype_cluster_tsv_file - - genotype_cluster_vcf_file - - ambient_rna_file - - stdout_log - - stderr_log - - -$namespaces: - s: http://schema.org/ - -$schemas: -- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf - - -label: "Souporcell Cluster by Genotype for RNA" -s:name: "Souporcell Cluster by Genotype for RNA" -s:alternateName: "Souporcell Cluster by Genotype for RNA" - -s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/souporcell-rna.cwl -s:codeRepository: 
https://github.com/Barski-lab/workflows-datirium -s:license: http://www.apache.org/licenses/LICENSE-2.0 - -s:isPartOf: - class: s:CreativeWork - s:name: Common Workflow Language - s:url: http://commonwl.org/ - -s:creator: -- class: s:Organization - s:legalName: "Cincinnati Children's Hospital Medical Center" - s:location: - - class: s:PostalAddress - s:addressCountry: "USA" - s:addressLocality: "Cincinnati" - s:addressRegion: "OH" - s:postalCode: "45229" - s:streetAddress: "3333 Burnet Ave" - s:telephone: "+1(513)636-4200" - s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" - s:department: - - class: s:Organization - s:legalName: "Allergy and Immunology" - s:department: - - class: s:Organization - s:legalName: "Barski Research Lab" - s:member: - - class: s:Person - s:name: Michael Kotliar - s:email: mailto:misha.kotliar@gmail.com - s:sameAs: - - id: http://orcid.org/0000-0002-6486-3898 - - -doc: | - Souporcell Cluster by Genotype for RNA - - Souporcell: robust clustering of single-cell data by - genotype without reference genotypes \ No newline at end of file From e0f3ceb30e5ef6f1102e05befa447b4f3ee53d0d Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 28 May 2024 15:53:46 -0400 Subject: [PATCH 139/162] Update sc-tools image to the latest (search by gene in scATAC) --- tools/sc-atac-cluster.cwl | 2 +- tools/sc-atac-coverage.cwl | 2 +- tools/sc-atac-dbinding.cwl | 2 +- tools/sc-atac-filter.cwl | 2 +- tools/sc-atac-reduce.cwl | 2 +- tools/sc-ctype-assign.cwl | 2 +- tools/sc-multiome-filter.cwl | 2 +- tools/sc-rna-cluster.cwl | 2 +- tools/sc-rna-da-cells.cwl | 2 +- tools/sc-rna-de-pseudobulk.cwl | 2 +- tools/sc-rna-filter.cwl | 2 +- tools/sc-rna-reduce.cwl | 2 +- tools/sc-rna-trajectory.cwl | 2 +- tools/sc-triangulate.cwl | 2 +- tools/sc-vdj-profile.cwl | 2 +- tools/sc-wnn-cluster.cwl | 2 +- 16 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tools/sc-atac-cluster.cwl 
b/tools/sc-atac-cluster.cwl index d6332e13..ecffd4b8 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.37 + dockerPull: biowardrobe2/sc-tools:v0.0.38 inputs: diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index 799743b4..59920c51 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.37 + dockerPull: biowardrobe2/sc-tools:v0.0.38 inputs: diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index 2de29ade..3f8dd891 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.37 + dockerPull: biowardrobe2/sc-tools:v0.0.38 inputs: diff --git a/tools/sc-atac-filter.cwl b/tools/sc-atac-filter.cwl index 95b574be..7e7bd475 100644 --- a/tools/sc-atac-filter.cwl +++ b/tools/sc-atac-filter.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.37 + dockerPull: biowardrobe2/sc-tools:v0.0.38 inputs: diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index 7018d905..c27f25e3 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.37 + dockerPull: biowardrobe2/sc-tools:v0.0.38 inputs: diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index e7498480..f1afb965 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.37 + dockerPull: biowardrobe2/sc-tools:v0.0.38 inputs: diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index 
5e001ee7..6a1bf6ad 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.37 + dockerPull: biowardrobe2/sc-tools:v0.0.38 inputs: diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index 6aabc6d3..b42a75a7 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.37 + dockerPull: biowardrobe2/sc-tools:v0.0.38 inputs: diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index 33d130e5..e04f8a2a 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.37 + dockerPull: biowardrobe2/sc-tools:v0.0.38 inputs: diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index 0be3a3c4..e37b918f 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.37 + dockerPull: biowardrobe2/sc-tools:v0.0.38 inputs: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index 5bb29826..fda371c2 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.37 + dockerPull: biowardrobe2/sc-tools:v0.0.38 inputs: diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index 316358ad..d0029f41 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.37 + dockerPull: biowardrobe2/sc-tools:v0.0.38 inputs: diff --git a/tools/sc-rna-trajectory.cwl b/tools/sc-rna-trajectory.cwl index 0c8f3189..946d15d7 100644 --- 
a/tools/sc-rna-trajectory.cwl +++ b/tools/sc-rna-trajectory.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.37 + dockerPull: biowardrobe2/sc-tools:v0.0.38 inputs: diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index 72f22d41..348aa131 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.37 + dockerPull: biowardrobe2/sc-tools:v0.0.38 inputs: diff --git a/tools/sc-vdj-profile.cwl b/tools/sc-vdj-profile.cwl index d5b541c0..7920d60c 100644 --- a/tools/sc-vdj-profile.cwl +++ b/tools/sc-vdj-profile.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.37 + dockerPull: biowardrobe2/sc-tools:v0.0.38 inputs: diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index b5cfb4a0..36cbcb30 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.37 + dockerPull: biowardrobe2/sc-tools:v0.0.38 inputs: From b71d0348eb9a37155bb2083b341ac176e66dd073 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 12 Jun 2024 10:15:37 -0400 Subject: [PATCH 140/162] Increased the default memory limit in R to 128G --- workflows/sc-atac-cluster.cwl | 2 +- workflows/sc-atac-coverage.cwl | 2 +- workflows/sc-atac-dbinding.cwl | 2 +- workflows/sc-atac-filter.cwl | 2 +- workflows/sc-atac-reduce.cwl | 2 +- workflows/sc-ctype-assign.cwl | 2 +- workflows/sc-multiome-filter.cwl | 2 +- workflows/sc-rna-cluster.cwl | 2 +- workflows/sc-rna-da-cells.cwl | 2 +- workflows/sc-rna-de-pseudobulk.cwl | 2 +- workflows/sc-rna-filter.cwl | 2 +- workflows/sc-rna-reduce.cwl | 2 +- workflows/sc-rna-trajectory.cwl | 2 +- workflows/sc-triangulate.cwl | 2 +- workflows/sc-vdj-profile.cwl | 2 +- workflows/sc-wnn-cluster.cwl | 2 +- 16 files 
changed, 16 insertions(+), 16 deletions(-) diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index f662ae0e..4ea595dd 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -455,7 +455,7 @@ steps: parallel_memory_limit: default: 32 vector_memory_limit: - default: 96 + default: 128 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-atac-coverage.cwl b/workflows/sc-atac-coverage.cwl index d83437f1..a2370049 100644 --- a/workflows/sc-atac-coverage.cwl +++ b/workflows/sc-atac-coverage.cwl @@ -235,7 +235,7 @@ steps: parallel_memory_limit: default: 32 vector_memory_limit: - default: 96 + default: 128 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-atac-dbinding.cwl b/workflows/sc-atac-dbinding.cwl index e98462bb..2ad3d97f 100644 --- a/workflows/sc-atac-dbinding.cwl +++ b/workflows/sc-atac-dbinding.cwl @@ -667,7 +667,7 @@ steps: parallel_memory_limit: default: 32 vector_memory_limit: - default: 96 + default: 128 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-atac-filter.cwl b/workflows/sc-atac-filter.cwl index aa352b61..ebc43d6f 100644 --- a/workflows/sc-atac-filter.cwl +++ b/workflows/sc-atac-filter.cwl @@ -820,7 +820,7 @@ steps: parallel_memory_limit: default: 32 vector_memory_limit: - default: 96 + default: 128 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index 685a46d8..fb880881 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -515,7 +515,7 @@ steps: parallel_memory_limit: default: 32 vector_memory_limit: - default: 96 + default: 128 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index df16696e..2365077b 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -755,7 +755,7 @@ steps: parallel_memory_limit: 
default: 32 vector_memory_limit: - default: 96 + default: 128 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 2fdd9ecd..19b40da8 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -1367,7 +1367,7 @@ steps: parallel_memory_limit: default: 32 vector_memory_limit: - default: 96 + default: 128 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index 20ed1c40..1f1ada82 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -633,7 +633,7 @@ steps: parallel_memory_limit: default: 32 vector_memory_limit: - default: 96 + default: 128 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-rna-da-cells.cwl b/workflows/sc-rna-da-cells.cwl index a1ce99ad..a2fb734a 100644 --- a/workflows/sc-rna-da-cells.cwl +++ b/workflows/sc-rna-da-cells.cwl @@ -391,7 +391,7 @@ steps: parallel_memory_limit: default: 32 vector_memory_limit: - default: 96 + default: 128 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-rna-de-pseudobulk.cwl b/workflows/sc-rna-de-pseudobulk.cwl index 179e54a4..678b1ea1 100644 --- a/workflows/sc-rna-de-pseudobulk.cwl +++ b/workflows/sc-rna-de-pseudobulk.cwl @@ -647,7 +647,7 @@ steps: parallel_memory_limit: default: 32 vector_memory_limit: - default: 96 + default: 128 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index 8ef37599..e82914d6 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -802,7 +802,7 @@ steps: parallel_memory_limit: default: 32 vector_memory_limit: - default: 96 + default: 128 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 42f45ede..c67915d1 100644 --- a/workflows/sc-rna-reduce.cwl 
+++ b/workflows/sc-rna-reduce.cwl @@ -637,7 +637,7 @@ steps: parallel_memory_limit: default: 32 vector_memory_limit: - default: 96 + default: 128 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-rna-trajectory.cwl b/workflows/sc-rna-trajectory.cwl index 61d15c83..9e08eeba 100644 --- a/workflows/sc-rna-trajectory.cwl +++ b/workflows/sc-rna-trajectory.cwl @@ -516,7 +516,7 @@ steps: parallel_memory_limit: default: 32 vector_memory_limit: - default: 96 + default: 128 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-triangulate.cwl b/workflows/sc-triangulate.cwl index 636492d3..347c2906 100644 --- a/workflows/sc-triangulate.cwl +++ b/workflows/sc-triangulate.cwl @@ -324,7 +324,7 @@ steps: parallel_memory_limit: default: 32 vector_memory_limit: - default: 96 + default: 128 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-vdj-profile.cwl b/workflows/sc-vdj-profile.cwl index f149c852..26f3a1ad 100644 --- a/workflows/sc-vdj-profile.cwl +++ b/workflows/sc-vdj-profile.cwl @@ -620,7 +620,7 @@ steps: parallel_memory_limit: default: 32 vector_memory_limit: - default: 96 + default: 128 threads: source: threads valueFrom: $(parseInt(self)) diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index 871aa700..47645588 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -770,7 +770,7 @@ steps: parallel_memory_limit: default: 32 vector_memory_limit: - default: 96 + default: 128 threads: source: threads valueFrom: $(parseInt(self)) From 1972cb24dbc44c1769ee18b772a0e02e49a9ce5e Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 13 Jun 2024 15:26:03 -0400 Subject: [PATCH 141/162] Add MAnorm2 pipeline --- tools/manorm2.cwl | 745 +++++++++++++++++++++++ workflows/manorm2.cwl | 1301 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 2046 insertions(+) create mode 100644 tools/manorm2.cwl create mode 100644 workflows/manorm2.cwl diff 
--git a/tools/manorm2.cwl b/tools/manorm2.cwl new file mode 100644 index 00000000..09fc7a5f --- /dev/null +++ b/tools/manorm2.cwl @@ -0,0 +1,745 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/manorm2:v0.0.1 + + +inputs: + + read_files_cond_1: + type: + - File + - type: array + items: File + inputBinding: + prefix: "--read1" + doc: | + Coordinate sorted and indexed BAM files with + aligned reads from the samples that belong + to the first biological condition. + + read_files_cond_2: + type: + - File + - type: array + items: File + inputBinding: + prefix: "--read2" + doc: | + Coordinate sorted and indexed BAM files with + aligned reads from the samples that belong + to the second biological condition. + + peak_files_cond_1: + type: + - File + - type: array + items: File + inputBinding: + prefix: "--peak1" + doc: | + Narrow or broad peak files with the peaks + called from the samples that belong to the + first biological condition. + + peak_files_cond_2: + type: + - File + - type: array + items: File + inputBinding: + prefix: "--peak2" + doc: | + Narrow or broad peak files with the peaks + called from the samples that belong to the + second biological condition. + + sample_names_cond_1: + type: + - string + - type: array + items: string + inputBinding: + prefix: "--name1" + doc: | + Names of the samples that belong to the first + biological condition. All values from the --name1 + and --name2 parameters should be unique. + + sample_names_cond_2: + type: + - string + - type: array + items: string + inputBinding: + prefix: "--name2" + doc: | + Names of the samples that belong to the second + biological condition. All values from the --name1 + and --name2 parameters should be unique. 
+ + summit_files_cond_1: + type: + - "null" + - File + - type: array + items: File + inputBinding: + prefix: "--summit1" + doc: | + BED files with the summits of the peaks called + from the samples that belong to the first + biological condition. If not provided, the peak + center is taken as the summit. + + summit_files_cond_2: + type: + - "null" + - File + - type: array + items: File + inputBinding: + prefix: "--summit2" + doc: | + BED files with the summits of the peaks called + from the samples that belong to the second + biological condition. If not provided, the peak + center is taken as the summit. + + condition_1: + type: + - "null" + - string + inputBinding: + prefix: "--condition1" + doc: | + Name for the first biological condition. The + direction of comparison is always --condition2 + vs --condition1. Default: control. + + condition_2: + type: + - "null" + - string + inputBinding: + prefix: "--condition2" + doc: | + Name for the second biological condition. The + direction of comparison is always --condition2 + vs --condition1. Default: treatment. + + minimum_overlap: + type: + - "null" + - float + inputBinding: + prefix: "--minoverlap" + doc: | + Filtering threshold to keep only those reference + genomic bins that are present in at least this + many samples within the biological condition. If + this threshold has a value between zero and one, + only those peaks will be included that are present in + at least this fraction of datasets. Default: 1 + + maximum_padj: + type: + - "null" + - float + inputBinding: + prefix: "--padj" + doc: | + Filtering threshold to report only differentially + bound sites with adjusted P-value less than or equal + to the provided value. Default: 0.05 + + batch_metadata_file: + type: + - "null" + - File + inputBinding: + prefix: "--batch" + doc: | + Optional headerless TSV/CSV file for batch effect + correction. First column should include values from + the --name1 and --name2 parameters in any order. 
+ The second column should include group names to + which each sample should be assigned. Default: do + not apply batch correction. + + maximum_peak_number: + type: + - "null" + - int + inputBinding: + prefix: "--maxpeaks" + doc: | + The maximum number of the most significant peaks + to select from each peak file when constructing + reference genomic bins. The top significant peaks + are selected based on the score column which is + calculated by MACS2 either as int(-10*log10pvalue) + or as int(-10*log10qvalue), depending on the cutoff + used for peak calling. Default: keep all peaks. + + minimum_peak_gap: + type: + - "null" + - int + inputBinding: + prefix: "--minpeakgap" + doc: | + Peaks remained after optional filtering by --maxpeaks + parameter will be merged if the distance between them + is smaller than the provided value. Merging is first + applied per sample and then to all peaks together + before splitting them into the reference genomic bins + of size --binsize. Default: 150 + + bin_size: + type: + - "null" + - int + inputBinding: + prefix: "--binsize" + doc: | + The size of non-overlapping reference genomic bins used + for generating a table of read per peak counts. 2000 bp + is recommended for sharp histone marks like H3K4me3 and + H3K27ac, and 1000 bp for TFs or DNase-seq. Default: 2000 + + fixed_bin_size: + type: + - "null" + - boolean + inputBinding: + prefix: "--fixbinsize" + doc: | + Force all reference genomic bins be exactly the + same size as provided in the --binsize parameter. + Default: when a merged peak is split into the + reference genomic bins, their sizes do not exceed + the boundaries of the original peak. + + blacklist_regions_file: + type: + - "null" + - File + inputBinding: + prefix: "--blacklist" + doc: | + BED file with the genomic blacklist regions. Any + reference genomic bin overlapping a blacklist region + will be removed from the analysis. Default: include + all reference genomic bins.
+ + remove_duplicated_reads: + type: + - "null" + - boolean + inputBinding: + prefix: "--dedup" + doc: | + Remove duplicated reads identified by their coordinates. + The location of a single-end read is determined by its + strand and 5' coordinate. For a paired-end read, the + DNA fragment position is used instead. Default: include + all reads. + + shift_size: + type: + - "null" + - int + inputBinding: + prefix: "--shiftsize" + doc: | + Shift the positions of the 5' ends of single-end + reads downstream on the selected value. Use the resulting + points for counting reads in the reference genomic bins. + Set as half of the DNA fragment size. Ignored if --paired + parameter is provided. Default 100 + + paired_end: + type: + - "null" + - boolean + inputBinding: + prefix: "--paired" + doc: | + Consider all reads as paired-end. When counting reads + in the reference genomic bins, use the middle point of + each DNA fragment. --shiftsize parameter is ignored. + Default: treat all reads as single-end. + + exclude_chromosomes: + type: + - "null" + - string + - type: array + items: string + inputBinding: + prefix: "--exclude" + doc: | + Define the chromosomes to be excluded from the analysis. + Default: include all chromosomes + + normalization_method: + type: + - "null" + - type: enum + symbols: + - "pseudo-reference" + - "baseline" + - "hierarchical" + inputBinding: + prefix: "--norm" + doc: | + Normalization method applied to the raw read counts. + pseudo-reference - normalize each sample to the pseudo + reference that includes the average intensities from all + samples. A reference genomic bin is occupied by the + pseudo reference if it was occupied by at least one + sample that the reference was constructed from. Each + sample is MA-normalized to the pseudo reference using + the common genomic bins between the reference and a + sample. baseline - normalize each sample to the one + whose log2 size factor is closest to 0.
hierarchical - + similar to the baseline but first all samples are + normalized within the biological conditions, then the two + biological conditions are normalized between each other. + Default: pseudo-reference + + export_pdf_plots: + type: + - "null" + - boolean + inputBinding: + prefix: "--pdf" + doc: | + Export plots in PDF. Default: false + + output_prefix: + type: + - "null" + - string + inputBinding: + prefix: "--output" + doc: | + Output prefix. Default: ./manorm + + parallel_memory_limit: + type: + - "null" + - int + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between + the workers when using multiple --cpus. + Default: 32 + + threads: + type: + - "null" + - int + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. Default: 1 + + seed: + type: + - "null" + - int + inputBinding: + prefix: "--seed" + doc: | + Seed number for random values. + Default: 42 + + +outputs: + + peak_profile_bins_xls: + type: File + outputBinding: + glob: "peak_profile_bins.xls" + doc: | + Read counts and enrichment status for + each deduced reference genomic bin in + each sample. + + diff_rgns_tsv: + type: File + outputBinding: + glob: "*_diff_rgns.tsv" + doc: | + Differentially bound sites, not filtered + by adjusted P-value threshold. + TSV format. + + smpl_corr_raw_plot_png: + type: + - "null" + - File + outputBinding: + glob: "*_smpl_corr_raw.png" + doc: | + Read counts correlation between the samples. + On the basis of the raw read counts within + the reference genomic bins. + PNG format. + + smpl_corr_crtd_plot_png: + type: + - "null" + - File + outputBinding: + glob: "*_smpl_corr_crtd.png" + doc: | + Read counts correlation between the samples. + On the basis of the batch corrected raw read + counts within the reference genomic bins + PNG format. + + smpl_corr_norm_plot_png: + type: + - "null" + - File + outputBinding: + glob: "*_smpl_corr_norm.png" + doc: | + Read counts correlation between the samples.
+ On the basis of the optionally batch corrected + normalized read counts within the reference + genomic bins. + PNG format. + + smpl_vrlp_plot_png: + type: + - "null" + - File + outputBinding: + glob: "*_smpl_vrlp.png" + doc: | + Peaks overlap between the samples. On the + basis of the occupied by each sample + reference genomic bins. + PNG format. + + cnd_vrlp_plot_png: + type: + - "null" + - File + outputBinding: + glob: "*_cnd_vrlp.png" + doc: | + Peaks overlap between the biological conditions. + On the basis of the occupied by each biological + condition reference genomic bins. + PNG format. + + ma_corr_plot_png: + type: + - "null" + - File + outputBinding: + glob: "*_ma_corr.png" + doc: | + Correlation between M and A values across the + common peak regions of either each pair of + biological conditions or each pair of samples. + PNG format. + + diff_vlcn_plot_png: + type: + - "null" + - File + outputBinding: + glob: "*_diff_vlcn.png" + doc: | + Volcano plot for differentially bound sites. + PNG format. + + diff_ma_plot_png: + type: + - "null" + - File + outputBinding: + glob: "*_diff_ma.png" + doc: | + MA-plot for differentially bound sites. + PNG format. + + pca_1_2_plot_png: + type: + - "null" + - File + outputBinding: + glob: "*_pca_1_2.png" + doc: | + Read counts PCA (PC1/PC2) + PNG format. + + pca_2_3_plot_png: + type: + - "null" + - File + outputBinding: + glob: "*_pca_2_3.png" + doc: | + Read counts PCA (PC2/PC3) + PNG format. + + mds_plot_html: + type: + - "null" + - File + outputBinding: + glob: "*_mds_plot.html" + doc: | + MDS plot of optionally batch corrected + normalized read counts within the + reference genomic bins. + HTML format. + + read_cnts_gct: + type: + - "null" + - File + outputBinding: + glob: "*_read_cnts.gct" + doc: | + Optionally batch corrected normalized + read counts within the reference + genomic bins. 
+ GCT format + + all_plots_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*.pdf" + doc: | + All generated plots. + PDF format. + + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["run_manorm2.R"] +stderr: manorm_stderr.log +stdout: manorm_stdout.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "MAnorm2 for Normalizing and Comparing ChIP-Seq/ATAC-Seq Samples" +s:name: "MAnorm2 for Normalizing and Comparing ChIP-Seq/ATAC-Seq Samples" +s:alternateName: "MAnorm2 for Normalizing and Comparing ChIP-Seq/ATAC-Seq Samples" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/manorm2.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + +doc: | + MAnorm2 for Normalizing and Comparing ChIP-Seq/ATAC-Seq Samples + +s:about: | + usage: run_manorm2.R [-h] --read1 READ1 [READ1 ...] --read2 + READ2 [READ2 ...] --peak1 PEAK1 + [PEAK1 ...] --peak2 PEAK2 [PEAK2 ...] 
+ [--summit1 [SUMMIT1 ...]] + [--summit2 [SUMMIT2 ...]] --name1 NAME1 + [NAME1 ...] --name2 NAME2 [NAME2 ...] + [--condition1 CONDITION1] + [--condition2 CONDITION2] + [--minoverlap MINOVERLAP] [--padj PADJ] + [--batch BATCH] [--maxpeaks MAXPEAKS] + [--minpeakgap MINPEAKGAP] + [--binsize BINSIZE] [--fixbinsize] + [--blacklist BLACKLIST] [--dedup] + [--shiftsize SHIFTSIZE] [--paired] + [--exclude [EXCLUDE ...]] + [--norm {pseudo-reference,baseline,hierarchical}] + [--pdf] [--output OUTPUT] [--cpus CPUS] + [--memory MEMORY] [--tmpdir TMPDIR] + [--seed SEED] + + MAnorm2 for Normalizing and Comparing ChIP-seq Samples + + options: + -h, --help show this help message and exit + --read1 READ1 [READ1 ...] + Coordinate sorted and indexed BAM files with aligned + reads from the samples that belong to the first + biological condition. + --read2 READ2 [READ2 ...] + Coordinate sorted and indexed BAM files with aligned + reads from the samples that belong to the second + biological condition. + --peak1 PEAK1 [PEAK1 ...] + Narrow or broad peak files with the peaks called from + the samples that belong to the first biological + condition. + --peak2 PEAK2 [PEAK2 ...] + Narrow or broad peak files with the peaks called from + the samples that belong to the second biological + condition. + --summit1 [SUMMIT1 ...] + BED files with the summits of the peaks called from + the samples that belong to the first biological + condition. If not provided, the peak center is taken + as the summit. + --summit2 [SUMMIT2 ...] + BED files with the summits of the peaks called from + the samples that belong to the second biological + condition. If not provided, the peak center is taken + as the summit. + --name1 NAME1 [NAME1 ...] + Names of the samples that belong to the first + biological condition. All values from the --name1 and + --name2 parameters should be unique. + --name2 NAME2 [NAME2 ...] + Names of the samples that belong to the second + biological condition. 
All values from the --name1 and + --name2 parameters should be unique. + --condition1 CONDITION1 + Name for the first biological condition. The direction + of comparison is always --condition2 vs --condition1. + Default: control. + --condition2 CONDITION2 + Name for the second biological condition. The + direction of comparison is always --condition2 vs + --condition1. Default: treatment. + --minoverlap MINOVERLAP + Filtering threshold to keep only those reference + genomic bins that are present in at least this many + samples within the biological condition. If this + threshold has a value between zero and one, only those + peaks will be included that are present in at least + this fraction of datasets. Default: 1 + --padj PADJ Filtering threshold to report only differentially + bound sites with adjusted P-value less than or equal + to the provided value. Default: 0.05 + --batch BATCH Optional headerless TSV/CSV file for batch effect + correction. First column should include values from + the --name1 and --name2 parameters in any order. The + second column should include group names to which each + sample should be assigned. Default: do not apply batch + correction. + --maxpeaks MAXPEAKS The maximum number of the most significant peaks to + select from each peak file when constructing reference + genomic bins. The top significant peaks are selected + based on the score column which is calculated by MACS2 + either as int(-10*log10pvalue) or as + int(-10*log10qvalue), depending on the cutoff used for + peak calling. Default: keep all peaks. + --minpeakgap MINPEAKGAP + Peaks remaining after optional filtering by --maxpeaks + parameter will be merged if the distance between them + is smaller than the provided value. Merging is first + applied per sample and then to all peaks together + before splitting them into the reference genomic bins + of size --binsize.
Default: 150 + --binsize BINSIZE The size of non-overlapping reference genomic bins + used for generating a table of read per peak counts. + 2000 bp is recommended for sharp histone marks like + H3K4me3 and H3K27ac, and 1000 bp for TFs or DNase-seq. + Default: 2000 + --fixbinsize Force all reference genomic bins to be exactly the same + size as provided in the --binsize parameter. Default: + when a merged peak is split into the reference genomic + bins, their sizes do not exceed the boundaries of the + original peak. + --blacklist BLACKLIST + BED file with the genomic blacklist regions. Any + reference genomic bin overlapping a blacklist region + will be removed from the analysis. Default: include + all reference genomic bins. + --dedup Remove duplicated reads identified by their + coordinates. The location of a single-end read is + determined by its strand and 5' coordinate. For a + paired-end read, the DNA fragment position is used + instead. Default: include all reads. + --shiftsize SHIFTSIZE + Shift the positions of the 5' ends of single-end + reads downstream by the selected value. Use the + resulting points for counting reads in the reference + genomic bins. Set as half of the DNA fragment size. + Ignored if --paired parameter is provided. Default: 100 + --paired Consider all reads as paired-end. When counting reads + in the reference genomic bins, use the middle point of + each DNA fragment. --shiftsize parameter is ignored. + Default: treat all reads as single-end. + --exclude [EXCLUDE ...] + Define the chromosomes to be excluded from the + analysis. Default: include all chromosomes + --norm {pseudo-reference,baseline,hierarchical} + Normalization method applied to the raw read counts. + pseudo-reference - normalize each sample to the pseudo + reference that includes the average intensities from + all samples. A reference genomic bin is occupied by + the pseudo reference if it was occupied by at least + one sample that the reference was constructed from.
+ Each sample is MA-normalized to the pseudo reference + using the common genomic bins between the reference + and a sample. baseline - normalize each sample to the + one whose log2 size factor is closest to 0. + hierarchical - similar to the baseline but first all + samples are normalized within the biological + conditions, then two biological conditions are + normalized between each other. Default: pseudo- + reference + --pdf Export plots in PDF. Default: false + --output OUTPUT Output prefix. Default: ./manorm + --cpus CPUS Number of cores/cpus to use. Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple --cpus. Default: 32 + --tmpdir TMPDIR Directory to keep temporary files. Default: either + /tmp or defined by environment variables TMPDIR, TMP, + TEMP. + --seed SEED Seed number for random values. Default: 42 diff --git a/workflows/manorm2.cwl b/workflows/manorm2.cwl new file mode 100644 index 00000000..2bcd019c --- /dev/null +++ b/workflows/manorm2.cwl @@ -0,0 +1,1301 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var split_features = function(line) { + function get_unique(value, index, self) { + return self.indexOf(value) === index && value != ""; + } + var splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; /* var, not let - CWL expressions must be ECMAScript 5.1 */ + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + + +"sd:upstream": + control: + - "trim-chipseq-se.cwl" + - "trim-chipseq-pe.cwl" + - "trim-atacseq-se.cwl" + - "trim-atacseq-pe.cwl" + treatment: + - "trim-chipseq-se.cwl" + - "trim-chipseq-pe.cwl" + - "trim-atacseq-se.cwl" + - "trim-atacseq-pe.cwl" + genome_indices: + - "genome-indices.cwl" + + +inputs: + + alias: + type: string + label: "Experiment short name" + sd:preview: + position: 1 + + read_files_cond_1:
+ type: File[] + label: "Control group sample(s)" + doc: | + Coordinate sorted and indexed BAM files with + aligned reads from the samples that belong + to the control group. + "sd:upstreamSource": "control/bambai_pair" + "sd:localLabel": true + + read_files_cond_2: + type: File[] + label: "Treatment group sample(s)" + doc: | + Coordinate sorted and indexed BAM files with + aligned reads from the samples that belong + to the treatment group. + "sd:upstreamSource": "treatment/bambai_pair" + "sd:localLabel": true + + narrow_peak_files_cond_1: + type: + - "null" + - File[] + label: "Control group sample(s)" + doc: | + Narrow peak files with the peaks called from + the samples that belong to the control group. + "sd:upstreamSource": "control/macs2_narrow_peaks" + + narrow_peak_files_cond_2: + type: + - "null" + - File[] + label: "Treatment group sample(s)" + doc: | + Narrow peak files with the peaks called from + the samples that belong to the treatment group. + "sd:upstreamSource": "treatment/macs2_narrow_peaks" + + broad_peak_files_cond_1: + type: + - "null" + - File[] + label: "Control group sample(s)" + doc: | + Broad peak files with the peaks called from + the samples that belong to the control group. + "sd:upstreamSource": "control/macs2_broad_peaks" + + broad_peak_files_cond_2: + type: + - "null" + - File[] + label: "Treatment group sample(s)" + doc: | + Broad peak files with the peaks called from + the samples that belong to the treatment group. + "sd:upstreamSource": "treatment/macs2_broad_peaks" + + genome_cov_files_cond_1: + type: File[] + label: "Control group sample(s)" + doc: | + Genome coverage files with the normalized + number of fragments at each base from the + samples that belong to the control group. 
+ "sd:upstreamSource": "control/bigwig" + + genome_cov_files_cond_2: + type: File[] + label: "Treatment group sample(s)" + doc: | + Genome coverage files with the normalized + number of fragments at each base from the + samples that belong to the treatment group. + "sd:upstreamSource": "treatment/bigwig" + + summit_files_cond_1: + type: + - "null" + - File[] + label: "Control group sample(s)" + doc: | + BED files with the summits of the peaks + called from the samples that belong to + the control group. If not provided, the + peak center is taken as the summit. + "sd:upstreamSource": "control/macs2_peak_summits" + + summit_files_cond_2: + type: + - "null" + - File[] + label: "Treatment group sample(s)" + doc: | + BED files with the summits of the peaks + called from the samples that belong to + the treatment group. If not provided, + the peak center is taken as the summit. + "sd:upstreamSource": "treatment/macs2_peak_summits" + + sample_names_cond_1: + type: string[] + label: "Control group sample(s)" + doc: | + Names of the samples that belong to + the control group. + "sd:upstreamSource": "control/alias" + + sample_names_cond_2: + type: string[] + label: "Treatment group sample(s)" + doc: | + Names of the samples that belong to + the treatment group. + "sd:upstreamSource": "treatment/alias" + + condition_1: + type: string? + default: "control" + label: "Control group sample(s)" + doc: | + Alternative name for the + control group. + + condition_2: + type: string? + default: "treatment" + label: "Treatment group sample(s)" + doc: | + Alternative name for the + treatment group. + + paired_end: + type: boolean? + default: false + label: "Consider all reads as paired-end" + doc: | + Consider all reads as paired-end. When + counting reads in the reference genomic + bins, use the middle point of each DNA + fragment. When running in the paired-end + mode the reads positions are not shifted. + Default: treat all reads as single-end. 
+ + annotation_file: + type: File + label: "Genome type" + doc: | + Genome annotation file for + the nearest genes assignment. + "sd:upstreamSource": "genome_indices/annotation" + + chrom_length_file: + type: File + label: "Genome type" + doc: | + Chromosome length file for + generating bigBed tracks. + "sd:upstreamSource": "genome_indices/chrom_length" + + maximum_peak_number: + type: int? + default: 0 + label: "Maximum number of peaks (optional)" + doc: | + The maximum number of the most significant + peaks to select from each peak file when + constructing reference genomic bins. The + top significant peaks are selected based + on the score column which is calculated + by MACS2 either as int(-10*log10pvalue) or + as int(-10*log10qvalue), depending on the + cutoff used for peak calling. + Default: 0 - keep all peaks. + + minimum_peak_gap: + type: int? + default: 150 + label: "Minimum peak gap" + doc: | + Peaks remaining after optional filtering by + the maximum number of the most significant + peaks will be merged if the distance between + them is smaller than the provided value. + Merging is first applied per sample and then + to all peaks together before splitting them + into the reference genomic bins of a selected + size. Default: 150 + + normalization_method: + type: + - "null" + - type: enum + symbols: + - "pseudo-reference" + - "baseline" + - "hierarchical" + default: "pseudo-reference" + label: "Normalization method" + doc: | + Normalization method applied to the + raw read counts. pseudo-reference - + normalize each sample to the pseudo + reference that includes the average + intensities from all samples. A + reference genomic bin is occupied by + the pseudo reference if it was occupied + by at least one sample that the reference + was constructed from. Each sample is + MA-normalized to the pseudo reference + using the common genomic bins between + the reference and a sample.
baseline - + normalize each sample to the one whose + log2 size factor is closest to 0. + hierarchical - similar to the baseline + but first all samples are normalized + within their groups, and then two groups + are normalized between each other. + Default: pseudo-reference + + maximum_padj: + type: float? + default: 0.05 + label: "Maximum adjusted P-value" + doc: | + Filtering threshold to report only + differentially bound sites with the + adjusted P-value less than or equal + to the provided value. Default: 0.05 + + batch_metadata_file: + type: File? + label: "Batch metadata file (optional)" + doc: | + Optional headerless TSV/CSV file for + batch effect correction. First column + should include sample names from + both the control and treatment groups in + an arbitrary order. The second column + should include categories to which + each sample should be assigned. + Default: do not apply batch correction. + + blacklist_regions_file: + type: File? + label: "Genomic blacklist regions file (optional)" + doc: | + BED file with the genomic blacklist + regions. Any reference genomic bin + overlapping a blacklist region will + be removed from the analysis. + Default: include all reference + genomic bins. + + remove_duplicated_reads: + type: boolean? + default: false + label: "Remove duplicated reads" + doc: | + Remove duplicated reads identified by + their coordinates. The location of a + single-end read is determined by its + strand and 5' coordinate. For a paired- + end read, the DNA fragment position is + used instead. Default: include all reads. + "sd:layout": + advanced: true + + bin_size: + type: int? + default: 2000 + label: "The size of the reference genomic bins" + doc: | + The size of non-overlapping reference + genomic bins used for generating a table + of read per peak counts. 2000 bp is + recommended for sharp histone marks like + H3K4me3 and H3K27ac, and 1000 bp for TFs + or DNase-seq.
Default: 2000 + "sd:layout": + advanced: true + + fixed_bin_size: + type: boolean? + default: false + label: "Force equal size for all reference genomic bins" + doc: | + Force all reference genomic bins to be + exactly the same size (as selected). + Default: when a merged peak is split + into the reference genomic bins, their + sizes do not exceed the boundaries of + the original peak. + "sd:layout": + advanced: true + + shift_size: + type: int? + default: 100 + label: "Shift size for single-end reads" + doc: | + Shift the positions of the 5' ends + of single-end reads downstream by + the selected value. Use the resulting + points for counting reads in the + reference genomic bins. Set as half + of the DNA fragment size. Ignored if + analysis is run in the paired-end mode. + Default: 100 + "sd:layout": + advanced: true + + exclude_chromosomes: + type: string? + default: null + label: "Chromosomes to be excluded from the analysis" + doc: | + A comma- or space-separated list of + chromosomes to be excluded from the + analysis. Default: include all + chromosomes + "sd:layout": + advanced: true + + minimum_overlap: + type: float? + default: 1 + label: "Minimum peak overlap between the samples" + doc: | + Filtering threshold to keep only those + reference genomic bins that are present + in at least this many samples within + each group. If this threshold has a value + between zero and one, only those peaks + will be included that are present in at + least this fraction of samples. + Default: 1 + "sd:layout": + advanced: true + + promoter_dist: + type: int? + default: 1000 + label: "Promoter distance, bp" + doc: | + Max distance from the gene TSS (in + both directions) overlapping which + the differentially bound site will + be assigned to the promoter region. + Default: 1000 bp + "sd:layout": + advanced: true + + upstream_dist: + type: int?
+ default: 20000 + label: "Upstream distance, bp" + doc: | + Max distance from the promoter (only + in upstream direction) overlapping + which the differentially bound site + will be assigned to the upstream region. + Default: 20,000 bp + "sd:layout": + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + - "2" + - "3" + - "4" + - "5" + - "6" + default: "4" + label: "Cores/CPUs" + doc: | + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 4 + "sd:layout": + advanced: true + + +outputs: + + peak_profile_bins_xls: + type: File + outputSource: manorm/peak_profile_bins_xls + label: "Read counts and enrichment status in each sample" + doc: | + Read counts and enrichment status for + each deduced reference genomic bin in + each sample. + + coverage_files_cond_1: + type: File[] + label: "Control group sample(s)" + doc: | + Genome coverage files with the normalized + number of fragments at each base from the + samples that belong to the control group. + outputSource: pipe/coverage_files_cond_1 + "sd:visualPlugins": + - igvbrowser: + tab: "Genome Browser" + id: "igvbrowser" + type: "wig" + name: "Coverage (control)" + height: 120 + + coverage_files_cond_2: + type: File[] + label: "Treatment group sample(s)" + doc: | + Genome coverage files with the normalized + number of fragments at each base from the + samples that belong to the treatment group. + outputSource: pipe/coverage_files_cond_2 + "sd:visualPlugins": + - igvbrowser: + tab: "Genome Browser" + id: "igvbrowser" + type: "wig" + name: "Coverage (treatment)" + height: 120 + + n_peak_files_cond_1: + type: + - "null" + - File[] + label: "Control group sample(s)" + doc: | + Narrow peak files with the peaks called + from the samples that belong to the + control group. 
+ outputSource: pipe/n_peak_files_cond_1 + "sd:visualPlugins": + - igvbrowser: + tab: "Genome Browser" + id: "igvbrowser" + type: "annotation" + name: "Peaks (control)" + displayMode: "COLLAPSE" + height: 40 + + n_peak_files_cond_2: + type: + - "null" + - File[] + label: "Treatment group sample(s)" + doc: | + Narrow peak files with the peaks called + from the samples that belong to the + treatment group. + outputSource: pipe/n_peak_files_cond_2 + "sd:visualPlugins": + - igvbrowser: + tab: "Genome Browser" + id: "igvbrowser" + type: "annotation" + name: "Peaks (treatment)" + displayMode: "COLLAPSE" + height: 40 + + b_peak_files_cond_1: + type: + - "null" + - File[] + label: "Control group sample(s)" + doc: | + Broad peak files with the peaks called from + the samples that belong to the control group. + outputSource: pipe/b_peak_files_cond_1 + "sd:visualPlugins": + - igvbrowser: + tab: "Genome Browser" + id: "igvbrowser" + type: "annotation" + name: "Peaks (control)" + displayMode: "COLLAPSE" + height: 40 + + b_peak_files_cond_2: + type: + - "null" + - File[] + label: "Treatment group sample(s)" + doc: | + Broad peak files with the peaks called from + the samples that belong to the treatment group. + outputSource: pipe/b_peak_files_cond_2 + "sd:visualPlugins": + - igvbrowser: + tab: "Genome Browser" + id: "igvbrowser" + type: "annotation" + name: "Peaks (treatment)" + displayMode: "COLLAPSE" + height: 40 + + diff_rgns_bigbed: + type: File + label: "Differentially bound sites (bigBed format)" + doc: | + Differentially bound sites. + bigBed format. + outputSource: bed_to_bigbed/bigbed_file + "sd:visualPlugins": + - igvbrowser: + tab: "Genome Browser" + id: "igvbrowser" + type: "annotation" + format: "bigbed" + name: "Differentially bound sites" + height: 40 + + smpl_corr_raw_plot_png: + type: File? + label: "Read counts correlation between the samples (raw)" + doc: | + Read counts correlation between the samples. 
+ On the basis of the raw read counts within + the reference genomic bins. + PNG format. + outputSource: manorm/smpl_corr_raw_plot_png + "sd:visualPlugins": + - image: + tab: "Exploratory plots" + Caption: "Read counts correlation between the samples (raw)" + + smpl_corr_crtd_plot_png: + type: File? + label: "Read counts correlation between the samples (batch corrected)" + doc: | + Read counts correlation between the samples. + On the basis of the batch corrected raw read + counts within the reference genomic bins + PNG format. + outputSource: manorm/smpl_corr_crtd_plot_png + "sd:visualPlugins": + - image: + tab: "Exploratory plots" + Caption: "Read counts correlation between the samples (batch corrected)" + + smpl_corr_norm_plot_png: + type: File? + label: "Read counts correlation between the samples (normalized)" + doc: | + Read counts correlation between the samples. + On the basis of the optionally batch corrected + normalized read counts within the reference + genomic bins. + PNG format. + outputSource: manorm/smpl_corr_norm_plot_png + "sd:visualPlugins": + - image: + tab: "Exploratory plots" + Caption: "Read counts correlation between the samples (normalized)" + + smpl_vrlp_plot_png: + type: File? + label: "Peaks overlap between the samples" + doc: | + Peaks overlap between the samples. On the + basis of the occupied by each sample + reference genomic bins. + PNG format. + outputSource: manorm/smpl_vrlp_plot_png + "sd:visualPlugins": + - image: + tab: "Exploratory plots" + Caption: "Peaks overlap between the samples" + + cnd_vrlp_plot_png: + type: File? + label: "Peaks overlap between the biological conditions" + doc: | + Peaks overlap between the biological conditions. + On the basis of the occupied by each biological + condition reference genomic bins. + PNG format. 
+ outputSource: manorm/cnd_vrlp_plot_png + "sd:visualPlugins": + - image: + tab: "Exploratory plots" + Caption: "Peaks overlap between the biological conditions" + + ma_corr_plot_png: + type: File? + label: "Correlation between M and A values" + doc: | + Correlation between M and A values across the + common peak regions of either each pair of + biological conditions or each pair of samples. + PNG format. + outputSource: manorm/ma_corr_plot_png + "sd:visualPlugins": + - image: + tab: "Exploratory plots" + Caption: "Correlation between M and A values" + + diff_vlcn_plot_png: + type: File? + label: "Volcano plot for differentially bound sites" + doc: | + Volcano plot for differentially bound sites. + PNG format. + outputSource: manorm/diff_vlcn_plot_png + "sd:visualPlugins": + - image: + tab: "Differential plots" + Caption: "Volcano plot for differentially bound sites" + + diff_ma_plot_png: + type: File? + label: "MA-plot for differentially bound sites" + doc: | + MA-plot for differentially bound sites. + PNG format. + outputSource: manorm/diff_ma_plot_png + "sd:visualPlugins": + - image: + tab: "Differential plots" + Caption: "MA-plot for differentially bound sites" + + pca_1_2_plot_png: + type: File? + label: "Read counts PCA (PC1/PC2)." + doc: | + Read counts PCA (PC1/PC2). + PNG format. + outputSource: manorm/pca_1_2_plot_png + "sd:visualPlugins": + - image: + tab: "Exploratory plots" + Caption: "PCA (1,2) of not filtered normalized counts" + + pca_2_3_plot_png: + type: File? + label: "Read counts PCA (PC2/PC3)" + doc: | + Read counts PCA (PC2/PC3). + PNG format. + outputSource: manorm/pca_2_3_plot_png + "sd:visualPlugins": + - image: + tab: "Exploratory plots" + Caption: "PCA (2,3) of not filtered normalized counts" + + mds_plot_html: + type: File? + outputSource: manorm/mds_plot_html + label: "MDS plot" + doc: | + MDS plot of optionally batch corrected + normalized read counts within the + reference genomic bins. + HTML format. 
+ "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + + diff_rgns_tsv: + type: File + label: "Differentially bound sites with assigned nearest genes" + doc: | + Differentially bound sites, not filtered + by adjusted P-value threshold, with the + assigned nearest genes. + TSV format. + outputSource: restore_columns/output_file + "sd:visualPlugins": + - syncfusiongrid: + tab: "Differentially bound sites" + Title: "Differentially bound sites" + + diff_rgns_labeled_tsv: + type: File + label: "Differentially bound sites with labels" + doc: | + Differentially bound sites, not filtered + by adjusted P-value threshold, with the + labels. + TSV format. + outputSource: add_label_column/output_file + + volcano_plot_html_file: + type: File + label: "Volcano Plot" + doc: | + Volcano Plot html index. + outputSource: make_volcano_plot/html_file + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + + volcano_plot_html_data: + type: Directory + label: "Volcano Plot (data)" + doc: | + Volcano Plot html data. + outputSource: make_volcano_plot/html_data + + ma_plot_html_file: + type: File + label: "MA-plot" + doc: | + MA-plot html index. + outputSource: make_ma_plot/html_file + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + + ma_plot_html_data: + type: Directory + label: "MA-plot (data)" + doc: | + MA-plot html data. + outputSource: make_ma_plot/html_data + + heatmap_html: + type: File + label: "Heatmap of normalized read counts within the reference genomic bins" + doc: | + Morpheus heatmap html index. + outputSource: morpheus_heatmap/heatmap_html + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + + read_cnts_gct: + type: File + label: "Normalized read counts within the reference genomic bins" + doc: | + Optionally batch corrected normalized + read counts within the reference + genomic bins. 
+ GCT format + outputSource: extend_gct/extended_gct + + experiment_info: + type: File + label: "Samples order for IGV" + doc: | + Markdown file to explain the sample + order for IGV. + outputSource: create_metadata/output_file + "sd:visualPlugins": + - markdownView: + tab: "Overview" + + pdf_plots: + type: File + outputSource: compress_pdf_plots/compressed_folder + label: "Compressed folder with all PDF plots" + doc: | + Compressed folder with all PDF plots. + + manorm_stdout_log: + type: File + label: "MAnorm output log" + doc: | + Stdout log from the manorm step. + outputSource: manorm/stdout_log + + manorm_stderr_log: + type: File + label: "MAnorm error log" + doc: | + Stderr log from the manorm step. + outputSource: manorm/stderr_log + + morpheus_stdout_log: + type: File + label: "Morpheus output log" + doc: | + Stdout log from the morpheus_heatmap step. + outputSource: morpheus_heatmap/stdout_log + + morpheus_stderr_log: + type: File + label: "Morpheus error log" + doc: | + Stderr log from the morpheus_heatmap step. 
+ outputSource: morpheus_heatmap/stderr_log + + +steps: + + pipe: + run: + cwlVersion: v1.0 + class: ExpressionTool + inputs: + genome_cov_files_cond_1: + type: File[] + genome_cov_files_cond_2: + type: File[] + narrow_peak_files_cond_1: + type: + - "null" + - File[] + narrow_peak_files_cond_2: + type: + - "null" + - File[] + broad_peak_files_cond_1: + type: + - "null" + - File[] + broad_peak_files_cond_2: + type: + - "null" + - File[] + outputs: + coverage_files_cond_1: + type: File[] + coverage_files_cond_2: + type: File[] + n_peak_files_cond_1: + type: + - "null" + - File[] + n_peak_files_cond_2: + type: + - "null" + - File[] + b_peak_files_cond_1: + type: + - "null" + - File[] + b_peak_files_cond_2: + type: + - "null" + - File[] + expression: | + ${ + var results = {}; + var output_names = [ + "coverage_files_cond_1", + "coverage_files_cond_2", + "n_peak_files_cond_1", + "n_peak_files_cond_2", + "b_peak_files_cond_1", + "b_peak_files_cond_2" + ]; + var sources = [ + inputs.genome_cov_files_cond_1, + inputs.genome_cov_files_cond_2, + inputs.narrow_peak_files_cond_1, + inputs.narrow_peak_files_cond_2, + inputs.broad_peak_files_cond_1, + inputs.broad_peak_files_cond_2 + ]; + for (var i = 0; i < sources.length; i++){ + var current_source = sources[i]; + var current_output_name = output_names[i]; + results[current_output_name] = null; + + if (current_source != null && current_source.length > 0){ + for (var j = 0; j < current_source.length; j++){ + var new_item = current_source[j]; + new_item["basename"] = "u" + "_" + i + "_" + j+ "_" + new_item.basename; + if (results[current_output_name] == null){ + results[current_output_name] = [new_item]; + } else { + results[current_output_name].push(new_item); + } + } + } + } + return results; + } + in: + genome_cov_files_cond_1: genome_cov_files_cond_1 + genome_cov_files_cond_2: genome_cov_files_cond_2 + narrow_peak_files_cond_1: narrow_peak_files_cond_1 + narrow_peak_files_cond_2: narrow_peak_files_cond_2 + 
broad_peak_files_cond_1: broad_peak_files_cond_1 + broad_peak_files_cond_2: broad_peak_files_cond_2 + out: + - coverage_files_cond_1 + - coverage_files_cond_2 + - n_peak_files_cond_1 + - n_peak_files_cond_2 + - b_peak_files_cond_1 + - b_peak_files_cond_2 + + manorm: + run: ../tools/manorm2.cwl + in: + read_files_cond_1: read_files_cond_1 + read_files_cond_2: read_files_cond_2 + peak_files_cond_1: + source: + - narrow_peak_files_cond_1 # [0] + - narrow_peak_files_cond_2 # [1] + - broad_peak_files_cond_1 # [2] + - broad_peak_files_cond_2 # [3] + valueFrom: | + ${ + if (self[2] && self[3]){ + return self[2]; + } + else { + return self[0]; + } + } + peak_files_cond_2: + source: + - narrow_peak_files_cond_1 # [0] + - narrow_peak_files_cond_2 # [1] + - broad_peak_files_cond_1 # [2] + - broad_peak_files_cond_2 # [3] + valueFrom: | + ${ + if (self[2] && self[3]){ + return self[3]; + } + else { + return self[1]; + } + } + sample_names_cond_1: sample_names_cond_1 + sample_names_cond_2: sample_names_cond_2 + summit_files_cond_1: summit_files_cond_1 + summit_files_cond_2: summit_files_cond_2 + condition_1: condition_1 + condition_2: condition_2 + minimum_overlap: minimum_overlap + maximum_padj: maximum_padj + batch_metadata_file: batch_metadata_file + maximum_peak_number: + source: maximum_peak_number + valueFrom: | + ${ + if (self == 0){ + return null; + } + else { + return self; + } + } + minimum_peak_gap: minimum_peak_gap + bin_size: bin_size + fixed_bin_size: fixed_bin_size + blacklist_regions_file: blacklist_regions_file + remove_duplicated_reads: remove_duplicated_reads + shift_size: shift_size + paired_end: paired_end + exclude_chromosomes: + source: exclude_chromosomes + valueFrom: $(split_features(self)) + normalization_method: normalization_method + export_pdf_plots: + default: true + parallel_memory_limit: + default: 32 + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - peak_profile_bins_xls + - diff_rgns_tsv + - smpl_corr_raw_plot_png + - 
smpl_corr_crtd_plot_png + - smpl_corr_norm_plot_png + - smpl_vrlp_plot_png + - cnd_vrlp_plot_png + - ma_corr_plot_png + - diff_vlcn_plot_png + - diff_ma_plot_png + - pca_1_2_plot_png + - pca_2_3_plot_png + - mds_plot_html + - read_cnts_gct + - all_plots_pdf + - stderr_log + - stdout_log + + folder_pdf_plots: + run: ../tools/files-to-folder.cwl + in: + input_files: + source: + - manorm/all_plots_pdf + valueFrom: $(self.flat().filter(n => n)) + folder_basename: + default: "pdf_plots" + out: + - folder + + compress_pdf_plots: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: folder_pdf_plots/folder + out: + - compressed_folder + + filter_columns: + run: ../tools/custom-bash.cwl + in: + input_file: manorm/diff_rgns_tsv + script: # header of diff_rgns_tsv is lower-case, so drop it with "start" exactly as restore_columns does + default: > + cat $0 | grep -v "start" | awk + 'BEGIN {print "chr\tstart\tend\tlength\tabs_summit\tpileup\t-log10(pvalue)\tfold_enrichment\t-log10(qvalue)\tname"} + {print $1"\t"$2"\t"$3"\t"$3-$2+1"\t0\t"NR"\t0\t0\t0\t0"}' > `basename $0` + out: + - output_file + + assign_genes: + run: ../tools/iaintersect.cwl + in: + input_filename: filter_columns/output_file + annotation_filename: annotation_file + promoter_bp: promoter_dist + upstream_bp: upstream_dist + out: + - result_file + + restore_columns: + run: ../tools/custom-bash.cwl + in: + input_file: + - assign_genes/result_file + - manorm/diff_rgns_tsv + script: + default: | + cat $0 | grep -v "start" | sort -k 11n | cut -f 1-5,15 > iaintersect_result.tsv + cat $1 | grep -v "start" > manorm_result.tsv + HEADER=`head -n 1 $1`; + echo -e "refseq_id\tgene_id\ttxStart\ttxEnd\tstrand\tregion\t${HEADER}" > `basename $0`; + cat iaintersect_result.tsv | paste - manorm_result.tsv >> `basename $0` + rm iaintersect_result.tsv manorm_result.tsv + out: + - output_file + + convert_to_bed: + run: ../tools/custom-bash.cwl + in: + input_file: restore_columns/output_file + script: + default: | + cat "$0" | awk -F "\t" 'NR==1 {for (i=1; i<=NF; i++) {ix[$i]=i} } NR>1 {color="255,0,0"; if 
($ix["log2FoldChange"]<0) color="0,255,0"; print $ix["chr"]"\t"$ix["start"]"\t"$ix["end"]"\tpvalue="$ix["pvalue"]+0.0";padj="$ix["padj"]+0.0";log2FC="$ix["log2FoldChange"]"\t"1000"\t"$ix["strand"]"\t"$ix["start"]"\t"$ix["end"]"\t"color}' > `basename $0` + out: + - output_file + + sort_bed: + run: ../tools/linux-sort.cwl + in: + unsorted_file: convert_to_bed/output_file + key: + default: ["1,1","2,2n"] + out: + - sorted_file + + bed_to_bigbed: + run: ../tools/ucsc-bedtobigbed.cwl + in: + input_bed: sort_bed/sorted_file + bed_type: + default: "bed4+5" + chrom_length_file: chrom_length_file + output_filename: + source: sort_bed/sorted_file + valueFrom: $(self.basename.split('.').slice(0,-1).join('.') + ".bigBed") + out: + - bigbed_file + + add_label_column: + run: ../tools/custom-bash.cwl + in: + input_file: manorm/diff_rgns_tsv + script: + default: | + HEADER=`head -n 1 $0`; + echo -e "label\t${HEADER}" > diff_rgns_labeled.tsv; + cat "$0" | grep -v "start" | awk -F "\t" '{print $1":"$2"-"$3"\t"$0}' >> diff_rgns_labeled.tsv + out: + - output_file + + make_volcano_plot: + run: ../tools/volcano-plot.cwl + in: + diff_expr_file: add_label_column/output_file + x_axis_column: + default: "log2FoldChange" + y_axis_column: + default: "padj" + label_column: + default: "label" + out: + - html_data + - html_file + + make_ma_plot: + run: ../tools/ma-plot.cwl + in: + diff_expr_file: add_label_column/output_file + x_axis_column: + default: "baseMean" + y_axis_column: + default: "log2FoldChange" + label_column: + default: "label" + out: + - html_data + - html_file + + extend_gct: + run: + cwlVersion: v1.0 + class: CommandLineTool + hints: + - class: DockerRequirement + dockerPull: biowardrobe2/morpheus:v0.0.2 + - class: InitialWorkDirRequirement + listing: + - entryname: extend.R + entry: | + options(error=function(){traceback(3); quit(save="no", status=1, runLast=FALSE)}) + suppressMessages(library("cmapR")) + suppressMessages(library("dplyr")) + suppressMessages(library("tibble")) 
+ suppressMessages(library("morpheus")) + suppressMessages(library("argparse")) + args = commandArgs(trailingOnly=TRUE) + gct_data <- read.gct(args[1]) + metadata <- read.table(args[2], sep="\t", header=TRUE, check.names=FALSE, stringsAsFactors=FALSE) %>% + mutate(id=paste(chr, paste(start, end, sep="-"), sep=":")) %>% + select(id, gene_id, region) + row_metadata <- gct_data$rowAnnotations %>% + rownames_to_column("id") %>% + left_join(metadata, by="id") %>% + mutate_at("id", as.vector) + col_metadata <- gct_data$columnAnnotations %>% + rownames_to_column("id") %>% + mutate_at("id", as.vector) + gct_data <- new( + "GCT", + mat=gct_data$data[row_metadata$id, col_metadata$id], + rdesc=row_metadata, + cdesc=col_metadata + ) + write_gct(ds=gct_data, ofile="extended.gct", appenddim=FALSE) + inputs: + input_files: + type: File[] + inputBinding: + position: 5 + outputs: + extended_gct: + type: File + outputBinding: + glob: "extended.gct" + baseCommand: ["Rscript", "extend.R"] + in: + input_files: + - manorm/read_cnts_gct + - restore_columns/output_file + out: + - extended_gct + + morpheus_heatmap: + run: ../tools/morpheus-heatmap.cwl + in: + read_counts_gct: extend_gct/extended_gct + out: + - heatmap_html + - stdout_log + - stderr_log + + create_metadata: + run: ../tools/custom-bash.cwl + in: + input_file: + source: + - read_files_cond_1 + - read_files_cond_2 + valueFrom: $(self.flat().filter(n => n)) + param: + source: + - sample_names_cond_1 + - sample_names_cond_2 + valueFrom: $(self.flat().filter(n => n)) + script: + default: | + #!/bin/bash + set -- "$0" "$@" + COUNT=`expr $# / 2` + echo "| Sample | Index |" > experiment_info.md + echo "| :-- | --: |" >> experiment_info.md + j=1 + for i in "${@:$COUNT+1:$#}"; do + echo "| $i | $j |" >> experiment_info.md + (( j++ )) + done; + out: + - output_file + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "MAnorm2 
for Normalizing and Comparing ChIP-Seq/ATAC-Seq Samples" +s:name: "MAnorm2 for Normalizing and Comparing ChIP-Seq/ATAC-Seq Samples" +s:alternateName: "MAnorm2 for Normalizing and Comparing ChIP-Seq/ATAC-Seq Samples" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/manorm2.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: + - class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:michael.kotliar@cchmc.org + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + +doc: | + MAnorm2 for Normalizing and Comparing ChIP-Seq/ATAC-Seq Samples \ No newline at end of file From 1af241d4003829066788308f4f09f4689ef96beb Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 18 Jun 2024 15:07:06 -0400 Subject: [PATCH 142/162] Make MAnorm2 to show only significant diff. 
peaks in IGV --- workflows/manorm2.cwl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/workflows/manorm2.cwl b/workflows/manorm2.cwl index 2bcd019c..04055d94 100644 --- a/workflows/manorm2.cwl +++ b/workflows/manorm2.cwl @@ -1093,9 +1093,12 @@ steps: run: ../tools/custom-bash.cwl in: input_file: restore_columns/output_file + param: + source: maximum_padj + valueFrom: $(self + "") # to convert it to string script: default: | - cat "$0" | awk -F "\t" 'NR==1 {for (i=1; i<=NF; i++) {ix[$i]=i} } NR>1 {color="255,0,0"; if ($ix["log2FoldChange"]<0) color="0,255,0"; print $ix["chr"]"\t"$ix["start"]"\t"$ix["end"]"\tpvalue="$ix["pvalue"]+0.0";padj="$ix["padj"]+0.0";log2FC="$ix["log2FoldChange"]"\t"1000"\t"$ix["strand"]"\t"$ix["start"]"\t"$ix["end"]"\t"color}' > `basename $0` + cat "$0" | awk -F "\t" -v maximum_padj="$1" 'NR==1 {for (i=1; i<=NF; i++) {ix[$i]=i} } NR>1 && $ix["padj"]<=maximum_padj {color="255,0,0"; if ($ix["log2FoldChange"]<0) color="0,255,0"; print $ix["chr"]"\t"$ix["start"]"\t"$ix["end"]"\tpvalue="$ix["pvalue"]+0.0";padj="$ix["padj"]+0.0";log2FC="$ix["log2FoldChange"]"\t"1000"\t"$ix["strand"]"\t"$ix["start"]"\t"$ix["end"]"\t"color}' > `basename $0` out: - output_file From 82c673bfab16c33ee35c5d776135995e752d5b6e Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 20 Jun 2024 11:51:05 -0400 Subject: [PATCH 143/162] Mark old QuantSeq pipeline as deprecated. 
Connect new QuantSeq to DESeq --- workflows/bedtools-multicov.cwl | 1 - workflows/feature-merge.cwl | 1 - workflows/genelists-deseq-diffbind.cwl | 1 - workflows/pca.cwl | 1 - ...im-quantseq-mrnaseq-se-strand-specific.cwl | 40 +++++++++---------- workflows/trim-quantseq-mrnaseq-se.cwl | 6 +-- 6 files changed, 21 insertions(+), 29 deletions(-) diff --git a/workflows/bedtools-multicov.cwl b/workflows/bedtools-multicov.cwl index 8838c3f6..c7461474 100644 --- a/workflows/bedtools-multicov.cwl +++ b/workflows/bedtools-multicov.cwl @@ -26,7 +26,6 @@ requirements: - "trim-rnaseq-pe-dutp.cwl" - "trim-rnaseq-se-dutp.cwl" - "trim-rnaseq-pe-smarter-dutp.cwl" - - "trim-quantseq-mrnaseq-se.cwl" - "trim-quantseq-mrnaseq-se-strand-specific.cwl" diff --git a/workflows/feature-merge.cwl b/workflows/feature-merge.cwl index 999b49c9..b2767b71 100644 --- a/workflows/feature-merge.cwl +++ b/workflows/feature-merge.cwl @@ -22,7 +22,6 @@ requirements: - "trim-rnaseq-pe-dutp.cwl" - "trim-rnaseq-pe-smarter-dutp.cwl" - "trim-rnaseq-se-dutp.cwl" - - "trim-quantseq-mrnaseq-se.cwl" inputs: diff --git a/workflows/genelists-deseq-diffbind.cwl b/workflows/genelists-deseq-diffbind.cwl index bdd1c0f0..5cf22d0f 100644 --- a/workflows/genelists-deseq-diffbind.cwl +++ b/workflows/genelists-deseq-diffbind.cwl @@ -37,7 +37,6 @@ requirements: - "trim-rnaseq-pe-dutp.cwl" - "trim-rnaseq-pe-smarter-dutp.cwl" - "trim-rnaseq-se-dutp.cwl" - - "trim-quantseq-mrnaseq-se.cwl" inputs: diff --git a/workflows/pca.cwl b/workflows/pca.cwl index 006c1372..c02ed133 100644 --- a/workflows/pca.cwl +++ b/workflows/pca.cwl @@ -22,7 +22,6 @@ requirements: - "trim-rnaseq-pe-dutp.cwl" - "trim-rnaseq-pe-smarter-dutp.cwl" - "trim-rnaseq-se-dutp.cwl" - - "trim-quantseq-mrnaseq-se.cwl" inputs: diff --git a/workflows/trim-quantseq-mrnaseq-se-strand-specific.cwl b/workflows/trim-quantseq-mrnaseq-se-strand-specific.cwl index c8dd8f5d..8085a120 100644 --- a/workflows/trim-quantseq-mrnaseq-se-strand-specific.cwl +++ 
b/workflows/trim-quantseq-mrnaseq-se-strand-specific.cwl @@ -60,7 +60,7 @@ inputs: - File - type: array items: File - label: "FASTQ input file" + label: "FASTQ input file(s)" format: "http://edamontology.org/format_1930" doc: "Reads data in a FASTQ format" @@ -176,7 +176,7 @@ outputs: doc: "STAR Log.progress.out" outputSource: star_aligner/log_progress - star_stdout_log_file: + star_stdout_log: type: File? format: "http://edamontology.org/format_2330" label: "STAR stdout log" @@ -235,32 +235,25 @@ outputs: doc: "Bowtie alignment log file" outputSource: bowtie_aligner/log_file - rpkm_isoforms: - type: File - format: "http://edamontology.org/format_3475" - label: "Transcript expression" - doc: "Transcript expression (not actually RPKM, name needed for deseq input)" - outputSource: group_transcript_expression/transcript_expression_file - - rpkm_genes: + gene_expression_file: type: File format: "http://edamontology.org/format_3475" label: "Gene expression" - doc: "Gene expression (not actually RPKM, name needed for deseq input)" + doc: "Gene expression" outputSource: group_transcript_expression/gene_expression_file 'sd:visualPlugins': - syncfusiongrid: tab: 'Gene Expression' Title: 'Read counts grouped by gene' - rpkm_common_tss: - type: File - format: "http://edamontology.org/format_3475" - label: "Common TSS expression" - doc: "Common TSS expression (not actually RPKM, name needed for deseq input)" - outputSource: group_transcript_expression/common_tss_expression_file + # common_tss_expression_file: + # type: File + # format: "http://edamontology.org/format_3475" + # label: "Common TSS expression" + # doc: "Common TSS expression" + # outputSource: group_transcript_expression/common_tss_expression_file - geep_gene_expression_file: + rpkm_genes: type: File format: "http://edamontology.org/format_3475" label: "GEEP: expression grouped by gene name" @@ -424,6 +417,8 @@ steps: run: ../tools/extract-fastq.cwl in: compressed_file: fastq_file + output_prefix: + default: 
"read_1" out: - fastq_file @@ -559,7 +554,8 @@ steps: valueFrom: $(get_root(self.basename)+".bam") threads: threads trigger: use_umi - out: [bam_bai_pair] + out: + - bam_bai_pair htseq_count_transcript_expression: run: ../tools/htseq-count.cwl @@ -860,8 +856,8 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -s:name: "QuantSeq 3' mRNA-Seq single-read" -label: "QuantSeq 3' mRNA-Seq single-read" +s:name: "QuantSeq 3' FWD, FWD-UMI or REV for single-read mRNA-Seq data" +label: "QuantSeq 3' FWD, FWD-UMI or REV for single-read mRNA-Seq data" s:alternateName: "Runs QuantSeq 3' FWD, FWD-UMI or REV analysis for single-read mRNA-Seq data" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/trim-quantseq-mrnaseq-se-strand-specific.cwl @@ -893,4 +889,4 @@ s:creator: doc: | - ### QuantSeq 3' FWD, FWD-UMI or REV for single-read mRNA-Seq data \ No newline at end of file + ### QuantSeq 3' FWD, FWD-UMI or REV for single-read mRNA-Seq data \ No newline at end of file diff --git a/workflows/trim-quantseq-mrnaseq-se.cwl b/workflows/trim-quantseq-mrnaseq-se.cwl index 7d111ad9..e86caf10 100644 --- a/workflows/trim-quantseq-mrnaseq-se.cwl +++ b/workflows/trim-quantseq-mrnaseq-se.cwl @@ -665,9 +665,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -s:name: "QuantSeq 3' mRNA-Seq single-read" -label: "QuantSeq 3' mRNA-Seq single-read" -s:alternateName: "Run QuantSeq 3' mRNA-Seq basic analysis with single-end data file" +s:name: "Deprecated. QuantSeq 3' mRNA-Seq single-read" +label: "Deprecated. QuantSeq 3' mRNA-Seq single-read" +s:alternateName: "Deprecated. 
Run QuantSeq 3' mRNA-Seq basic analysis with single-end data file" s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/trim-quantseq-mrnaseq-se.cwl s:codeRepository: https://github.com/datirium/workflows From 05c0c65f313d376fc358ff44d00f4ecbe9ff04cb Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 20 Jun 2024 15:01:53 -0400 Subject: [PATCH 144/162] Update label to enable saving to Loupe browser --- workflows/sc-ctype-assign.cwl | 2 +- workflows/sc-multiome-filter.cwl | 2 +- workflows/sc-rna-cluster.cwl | 2 +- workflows/sc-rna-da-cells.cwl | 2 +- workflows/sc-rna-filter.cwl | 2 +- workflows/sc-rna-reduce.cwl | 2 +- workflows/sc-rna-trajectory.cwl | 2 +- workflows/sc-triangulate.cwl | 2 +- workflows/sc-vdj-profile.cwl | 2 +- workflows/sc-wnn-cluster.cwl | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 2365077b..43c78c78 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -179,7 +179,7 @@ inputs: export_loupe_data: type: boolean? default: false - label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + label: "Save raw counts to Loupe file. I confirm that data is generated by 10x technology and accept the EULA available at https://10xgen.com/EULA" doc: | Save raw counts from the RNA assay to Loupe file. By enabling this feature you accept the End-User License diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 19b40da8..381ffd49 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -427,7 +427,7 @@ inputs: export_loupe_data: type: boolean? default: false - label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + label: "Save raw counts to Loupe file. 
I confirm that data is generated by 10x technology and accept the EULA available at https://10xgen.com/EULA" doc: | Save raw counts from the RNA assay to Loupe file. By enabling this feature you accept the End-User License diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index 1f1ada82..ec45990a 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -108,7 +108,7 @@ inputs: export_loupe_data: type: boolean? default: false - label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + label: "Save raw counts to Loupe file. I confirm that data is generated by 10x technology and accept the EULA available at https://10xgen.com/EULA" doc: | Save raw counts from the RNA assay to Loupe file. By enabling this feature you accept the End-User License diff --git a/workflows/sc-rna-da-cells.cwl b/workflows/sc-rna-da-cells.cwl index a2fb734a..06d8682b 100644 --- a/workflows/sc-rna-da-cells.cwl +++ b/workflows/sc-rna-da-cells.cwl @@ -107,7 +107,7 @@ inputs: export_loupe_data: type: boolean? default: false - label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + label: "Save raw counts to Loupe file. I confirm that data is generated by 10x technology and accept the EULA available at https://10xgen.com/EULA" doc: | Save raw counts from the RNA assay to Loupe file. By enabling this feature you accept the End-User License diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index e82914d6..4778af6d 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -239,7 +239,7 @@ inputs: export_loupe_data: type: boolean? default: false - label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + label: "Save raw counts to Loupe file. 
I confirm that data is generated by 10x technology and accept the EULA available at https://10xgen.com/EULA" doc: | Save raw counts from the RNA assay to Loupe file. By enabling this feature you accept the End-User License diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index c67915d1..654db337 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -260,7 +260,7 @@ inputs: export_loupe_data: type: boolean? default: false - label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + label: "Save raw counts to Loupe file. I confirm that data is generated by 10x technology and accept the EULA available at https://10xgen.com/EULA" doc: | Save raw counts from the RNA assay to Loupe file. By enabling this feature you accept the End-User License diff --git a/workflows/sc-rna-trajectory.cwl b/workflows/sc-rna-trajectory.cwl index 9e08eeba..a3915566 100644 --- a/workflows/sc-rna-trajectory.cwl +++ b/workflows/sc-rna-trajectory.cwl @@ -122,7 +122,7 @@ inputs: export_loupe_data: type: boolean? default: false - label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + label: "Save raw counts to Loupe file. I confirm that data is generated by 10x technology and accept the EULA available at https://10xgen.com/EULA" doc: | Save raw counts from the RNA assay to Loupe file. By enabling this feature you accept the End-User License diff --git a/workflows/sc-triangulate.cwl b/workflows/sc-triangulate.cwl index 347c2906..ccee7346 100644 --- a/workflows/sc-triangulate.cwl +++ b/workflows/sc-triangulate.cwl @@ -83,7 +83,7 @@ inputs: export_loupe_data: type: boolean? default: false - label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + label: "Save raw counts to Loupe file. 
I confirm that data is generated by 10x technology and accept the EULA available at https://10xgen.com/EULA" doc: | Save raw counts from the RNA assay to Loupe file. By enabling this feature you accept the End-User License diff --git a/workflows/sc-vdj-profile.cwl b/workflows/sc-vdj-profile.cwl index 26f3a1ad..781f7153 100644 --- a/workflows/sc-vdj-profile.cwl +++ b/workflows/sc-vdj-profile.cwl @@ -149,7 +149,7 @@ inputs: export_loupe_data: type: boolean? default: false - label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + label: "Save raw counts to Loupe file. I confirm that data is generated by 10x technology and accept the EULA available at https://10xgen.com/EULA" doc: | Save raw counts from the RNA assay to Loupe file. By enabling this feature you accept the End-User License diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index 47645588..00636685 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -157,7 +157,7 @@ inputs: export_loupe_data: type: boolean? default: false - label: "Save raw counts to Loupe file by accepting the EULA available at https://10xgen.com/EULA" + label: "Save raw counts to Loupe file. I confirm that data is generated by 10x technology and accept the EULA available at https://10xgen.com/EULA" doc: | Save raw counts from the RNA assay to Loupe file. 
By enabling this feature you accept the End-User License From 0576a75e592f9c17c2ad93d431ead3fc47bb9838 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 21 Jun 2024 11:37:52 -0400 Subject: [PATCH 145/162] Updated upstream of genelist deseq only to use not deprecated QuantSeq --- workflows/genelists-deseq-only.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/genelists-deseq-only.cwl b/workflows/genelists-deseq-only.cwl index 74266112..08e9fa18 100644 --- a/workflows/genelists-deseq-only.cwl +++ b/workflows/genelists-deseq-only.cwl @@ -28,7 +28,7 @@ requirements: - "trim-rnaseq-pe-dutp.cwl" - "trim-rnaseq-pe-smarter-dutp.cwl" - "trim-rnaseq-se-dutp.cwl" - - "trim-quantseq-mrnaseq-se.cwl" + - "trim-quantseq-mrnaseq-se-strand-specific.cwl" inputs: From e4240d72e513fa4d2e815f07990a54849f3eb213 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 21 Jun 2024 11:50:22 -0400 Subject: [PATCH 146/162] Add the latest QuantSeq as an upstream to genelist deseq diffbind --- workflows/genelists-deseq-diffbind.cwl | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/genelists-deseq-diffbind.cwl b/workflows/genelists-deseq-diffbind.cwl index 5cf22d0f..a5f62ec0 100644 --- a/workflows/genelists-deseq-diffbind.cwl +++ b/workflows/genelists-deseq-diffbind.cwl @@ -37,6 +37,7 @@ requirements: - "trim-rnaseq-pe-dutp.cwl" - "trim-rnaseq-pe-smarter-dutp.cwl" - "trim-rnaseq-se-dutp.cwl" + - "trim-quantseq-mrnaseq-se-strand-specific.cwl" inputs: From fd6194044081e1ffd5d051297e496eec29005183 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 1 Jul 2024 14:16:31 -0400 Subject: [PATCH 147/162] Update sc tools docker image to the latest --- tools/sc-atac-cluster.cwl | 2 +- tools/sc-atac-coverage.cwl | 2 +- tools/sc-atac-dbinding.cwl | 2 +- tools/sc-atac-filter.cwl | 2 +- tools/sc-atac-reduce.cwl | 2 +- tools/sc-ctype-assign.cwl | 10 ++----- tools/sc-multiome-filter.cwl | 2 +- tools/sc-rna-cluster.cwl | 4 +-- tools/sc-rna-da-cells.cwl | 2 
+- tools/sc-rna-de-pseudobulk.cwl | 2 +- tools/sc-rna-filter.cwl | 2 +- tools/sc-rna-reduce.cwl | 2 +- tools/sc-rna-trajectory.cwl | 2 +- tools/sc-triangulate.cwl | 2 +- tools/sc-vdj-profile.cwl | 2 +- tools/sc-wnn-cluster.cwl | 4 +-- workflows/sc-ctype-assign.cwl | 50 ++++++++++++++++------------------ workflows/sc-rna-cluster.cwl | 33 +++++++++++----------- workflows/sc-wnn-cluster.cwl | 35 ++++++++++++------------ 19 files changed, 76 insertions(+), 86 deletions(-) diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index ecffd4b8..75e0bf4b 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.38 + dockerPull: biowardrobe2/sc-tools:v0.0.39 inputs: diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index 59920c51..245d6dee 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.38 + dockerPull: biowardrobe2/sc-tools:v0.0.39 inputs: diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index 3f8dd891..63ba725b 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.38 + dockerPull: biowardrobe2/sc-tools:v0.0.39 inputs: diff --git a/tools/sc-atac-filter.cwl b/tools/sc-atac-filter.cwl index 7e7bd475..a027da83 100644 --- a/tools/sc-atac-filter.cwl +++ b/tools/sc-atac-filter.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.38 + dockerPull: biowardrobe2/sc-tools:v0.0.39 inputs: diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index c27f25e3..408a93b0 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: 
DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.38 + dockerPull: biowardrobe2/sc-tools:v0.0.39 inputs: diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index f1afb965..bf28ca84 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.38 + dockerPull: biowardrobe2/sc-tools:v0.0.39 inputs: @@ -604,15 +604,11 @@ outputs: PNG format. xpr_dnst_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputBinding: - glob: "*_xpr_dnst_*.png" + glob: "*_xpr_dnst.png" doc: | Gene expression density. - All genes of interest. PNG format. xpr_htmp_plot_png: diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index 6a1bf6ad..f3dd4f74 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.38 + dockerPull: biowardrobe2/sc-tools:v0.0.39 inputs: diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index b42a75a7..8c01db74 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.38 + dockerPull: biowardrobe2/sc-tools:v0.0.39 inputs: @@ -554,7 +554,7 @@ outputs: glob: "*_xpr_dnst_res_*.png" doc: | Gene expression density. - All genes of interest; all resolutions. + All resolutions. PNG format. 
xpr_htmp_res_plot_png: diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index e04f8a2a..0ee481c1 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.38 + dockerPull: biowardrobe2/sc-tools:v0.0.39 inputs: diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index e37b918f..c04283f1 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.38 + dockerPull: biowardrobe2/sc-tools:v0.0.39 inputs: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index fda371c2..31f7c28d 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.38 + dockerPull: biowardrobe2/sc-tools:v0.0.39 inputs: diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index d0029f41..913ac887 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.38 + dockerPull: biowardrobe2/sc-tools:v0.0.39 inputs: diff --git a/tools/sc-rna-trajectory.cwl b/tools/sc-rna-trajectory.cwl index 946d15d7..be8df3d6 100644 --- a/tools/sc-rna-trajectory.cwl +++ b/tools/sc-rna-trajectory.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.38 + dockerPull: biowardrobe2/sc-tools:v0.0.39 inputs: diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index 348aa131..e08ab3a8 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.38 + dockerPull: biowardrobe2/sc-tools:v0.0.39 inputs: diff --git 
a/tools/sc-vdj-profile.cwl b/tools/sc-vdj-profile.cwl index 7920d60c..7dbdf1b7 100644 --- a/tools/sc-vdj-profile.cwl +++ b/tools/sc-vdj-profile.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.38 + dockerPull: biowardrobe2/sc-tools:v0.0.39 inputs: diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index 36cbcb30..9562059d 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.38 + dockerPull: biowardrobe2/sc-tools:v0.0.39 inputs: @@ -749,7 +749,7 @@ outputs: glob: "*_xpr_dnst_res_*.png" doc: | Gene expression density. - All genes of interest; all resolutions. + All resolutions. PNG format. xpr_htmp_res_plot_png: diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 43c78c78..72729360 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -513,49 +513,45 @@ outputs: tab: "Split by group" Caption: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" - xpr_per_cell_plot_png: - type: - - "null" - - type: array - items: File - outputSource: ctype_assign/xpr_per_cell_plot_png - label: "UMAP colored by gene expression (per gene)" + xpr_avg_plot_png: + type: File? + outputSource: ctype_assign/xpr_avg_plot_png + label: "Average gene expression" doc: | - UMAP colored by gene expression. - All genes of interest. + Average gene expression. PNG format. "sd:visualPlugins": - image: - tab: "Genes of interest (UMAP)" - Caption: "UMAP colored by gene expression (per gene)" + tab: "Genes of interest (expression)" + Caption: "Average gene expression" xpr_dnst_plot_png: - type: - - "null" - - type: array - items: File + type: File? outputSource: ctype_assign/xpr_dnst_plot_png - label: "Gene expression density (per gene)" + label: "Gene expression density" doc: | Gene expression density. 
- All genes of interest. PNG format. "sd:visualPlugins": - image: - tab: "Genes of interest (violin plot)" - Caption: "Gene expression density (per gene)" + tab: "Genes of interest (expression)" + Caption: "Gene expression density" - xpr_avg_plot_png: - type: File? - outputSource: ctype_assign/xpr_avg_plot_png - label: "Average gene expression" + xpr_per_cell_plot_png: + type: + - "null" + - type: array + items: File + outputSource: ctype_assign/xpr_per_cell_plot_png + label: "UMAP colored by gene expression (per gene)" doc: | - Average gene expression. + UMAP colored by gene expression. + All genes of interest. PNG format. "sd:visualPlugins": - image: - tab: "Genes of interest (dot plot)" - Caption: "Average gene expression" + tab: "Genes of interest (expression)" + Caption: "UMAP colored by gene expression (per gene)" cvrg_plot_png: type: @@ -570,7 +566,7 @@ outputs: PNG format. "sd:visualPlugins": - image: - tab: "Genes of interest (coverage plot)" + tab: "Genes of interest (coverage)" Caption: "ATAC fragment coverage (per gene)" xpr_htmp_plot_png: diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index ec45990a..e1f74240 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -442,21 +442,20 @@ outputs: tab: "Split by group" Caption: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" - xpr_per_cell_plot_png: + xpr_avg_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_rna_cluster/xpr_per_cell_plot_png - label: "UMAP colored by gene expression (per gene)" + outputSource: sc_rna_cluster/xpr_avg_res_plot_png + label: "Average gene expression" doc: | - UMAP colored by gene expression. - All genes of interest. + Average gene expression. PNG format. 
"sd:visualPlugins": - image: - tab: "Genes of interest (UMAP)" - Caption: "UMAP colored by gene expression (per gene)" + tab: "Genes of interest (expression)" + Caption: "Average gene expression" xpr_dnst_res_plot_png: type: @@ -464,30 +463,30 @@ outputs: - type: array items: File outputSource: sc_rna_cluster/xpr_dnst_res_plot_png - label: "Gene expression density (per gene)" + label: "Gene expression density" doc: | Gene expression density. - All genes of interest. PNG format. "sd:visualPlugins": - image: - tab: "Genes of interest (violin plot)" - Caption: "Gene expression density (per gene)" + tab: "Genes of interest (expression)" + Caption: "Gene expression density" - xpr_avg_res_plot_png: + xpr_per_cell_plot_png: type: - "null" - type: array items: File - outputSource: sc_rna_cluster/xpr_avg_res_plot_png - label: "Average gene expression" + outputSource: sc_rna_cluster/xpr_per_cell_plot_png + label: "UMAP colored by gene expression (per gene)" doc: | - Average gene expression. + UMAP colored by gene expression. + All genes of interest. PNG format. "sd:visualPlugins": - image: - tab: "Genes of interest (dot plot)" - Caption: "Average gene expression" + tab: "Genes of interest (expression)" + Caption: "UMAP colored by gene expression (per gene)" xpr_htmp_res_plot_png: type: diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index 00636685..ae3353aa 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -542,21 +542,20 @@ outputs: tab: "Split by group" Caption: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" - xpr_per_cell_plot_png: + xpr_avg_res_plot_png: type: - "null" - type: array items: File - outputSource: sc_wnn_cluster/xpr_per_cell_plot_png - label: "UMAP colored by gene expression (per gene)" + outputSource: sc_wnn_cluster/xpr_avg_res_plot_png + label: "Average gene expression" doc: | - UMAP colored by gene expression. - All genes of interest. 
+ Average gene expression. PNG format. "sd:visualPlugins": - image: - tab: "Genes of interest (UMAP)" - Caption: "UMAP colored by gene expression (per gene)" + tab: "Genes of interest (expression)" + Caption: "Average gene expression" xpr_dnst_res_plot_png: type: @@ -564,30 +563,30 @@ outputs: - type: array items: File outputSource: sc_wnn_cluster/xpr_dnst_res_plot_png - label: "Gene expression density (per gene)" + label: "Gene expression density" doc: | Gene expression density. - All genes of interest. PNG format. "sd:visualPlugins": - image: - tab: "Genes of interest (violin plot)" - Caption: "Gene expression density (per gene)" + tab: "Genes of interest (expression)" + Caption: "Gene expression density" - xpr_avg_res_plot_png: + xpr_per_cell_plot_png: type: - "null" - type: array items: File - outputSource: sc_wnn_cluster/xpr_avg_res_plot_png - label: "Average gene expression" + outputSource: sc_wnn_cluster/xpr_per_cell_plot_png + label: "UMAP colored by gene expression (per gene)" doc: | - Average gene expression. + UMAP colored by gene expression. + All genes of interest. PNG format. "sd:visualPlugins": - image: - tab: "Genes of interest (dot plot)" - Caption: "Average gene expression" + tab: "Genes of interest (expression)" + Caption: "UMAP colored by gene expression (per gene)" cvrg_res_plot_png: type: @@ -602,7 +601,7 @@ outputs: PNG format. 
"sd:visualPlugins": - image: - tab: "Genes of interest (coverage plot)" + tab: "Genes of interest (coverage)" Caption: "ATAC fragment coverage (per gene)" xpr_htmp_res_plot_png: From f9bc591e51f605acc616d8a3359543b0976e4b57 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 10 Jul 2024 15:07:40 -0400 Subject: [PATCH 148/162] Update cellranger to the latest version 8 --- tools/cellbrowser-build-altanalyze.cwl | 60 +-- tools/cellbrowser-build-cellranger-arc.cwl | 186 +++++---- tools/cellbrowser-build-cellranger-atac.cwl | 82 ++-- tools/cellbrowser-build-cellranger.cwl | 78 ++-- tools/cellranger-aggr.cwl | 95 ++--- tools/cellranger-arc-aggr.cwl | 128 ++++--- tools/cellranger-arc-count.cwl | 236 +++++++----- tools/cellranger-arc-mkref.cwl | 31 +- tools/cellranger-atac-aggr.cwl | 107 +++--- tools/cellranger-atac-count.cwl | 118 ++++-- tools/cellranger-count.cwl | 235 +++++++----- tools/cellranger-mkref.cwl | 103 ++--- tools/cellranger-mkvdjref.cwl | 169 ++++++--- tools/cellranger-multi.cwl | 397 ++++++++++++-------- tools/cellranger-reanalyze.cwl | 353 ++++++++++------- tools/fastq-dump.cwl | 17 +- workflows/cellranger-aggr.cwl | 10 + workflows/cellranger-arc-aggr.cwl | 11 +- workflows/cellranger-arc-count.cwl | 49 +-- workflows/cellranger-atac-aggr.cwl | 5 + workflows/cellranger-atac-count.cwl | 5 + workflows/cellranger-mkvdjref.cwl | 34 ++ workflows/cellranger-multi.cwl | 76 ++-- 23 files changed, 1575 insertions(+), 1010 deletions(-) diff --git a/tools/cellbrowser-build-altanalyze.cwl b/tools/cellbrowser-build-altanalyze.cwl index fea19f04..ec2023b1 100644 --- a/tools/cellbrowser-build-altanalyze.cwl +++ b/tools/cellbrowser-build-altanalyze.cwl @@ -4,7 +4,7 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/cellbrowser:v0.0.2 + dockerPull: biowardrobe2/sc-tools:v0.0.39 requirements: @@ -13,31 +13,33 @@ requirements: listing: - entryname: cellbrowser.conf entry: | - name = "cellbrowser" - shortLabel="cellbrowser" + name = 
"RNA" + shortLabel = "RNA" priority = 1 - geneIdType="auto" - exprMatrix="expr_matrix.tsv" - meta="metadata.tsv" - coords=[ - { - "file":"coordinates.tsv", - "flipY" : False, - "shortLabel":"Clustering" - } + geneIdType = "auto" + exprMatrix = "expr_matrix.tsv" + meta = "metadata.tsv" + coords = [ + { + "file":"coordinates.tsv", + "flipY" : False, + "shortLabel":"Clustering" + } ] - markers=[ - { - "file":"markers.tsv", - "shortLabel":"Cluster-specific genes" - } + markers = [ + { + "file":"markers.tsv", + "shortLabel":"Cluster-specific genes" + } ] - enumFields = ["cell_ID"] - clusterField="Cluster" - labelField="Cell-Type-Prediction" + geneLabel = "Feature" + radius = 3 + alpha = 0.5 + clusterField = "Cluster" + labelField = "Cell-Type-Prediction" - entryname: desc.conf entry: | - title = "CellBrowser" + title = "RNA" abstract = "" methods = "" biorxiv_url = "" @@ -198,12 +200,13 @@ doc: | s:about: | Usage: cbBuild [options] -i cellbrowser.conf -o outputDir - add a dataset to the single cell viewer directory - If you have previously built into the same output directory with the same dataset and the expression matrix has not changed its filesize, this will be detected and the expression matrix will not be copied again. This means that an update of a few meta data attributes is quite quick. - + Gene symbol/annotation files are downloaded to ~/cellbrowserData when + needed. Config defaults can be specified in ~/.cellbrowser. See + documentation at https://cellbrowser.readthedocs.io/ Options: -h, --help show this help message and exit --init copy sample cellbrowser.conf and desc.conf to current @@ -215,11 +218,14 @@ s:about: | specified multiple times -o OUTDIR, --outDir=OUTDIR output directory, default can be set through the env. 
- variable CBOUT or ~/.cellbrowser.conf, current value: - none + variable CBOUT or ~/.cellbrowser, current value: none -p PORT, --port=PORT if build is successful, start an http server on this port and serve the result via http://localhost:port -r, --recursive run in all subdirectories of the current directory. - Useful when rebuilding a full hierarchy. + Useful when rebuilding a full hierarchy. Cannot be + used with -p. + --depth=DEPTH when using -r: only go this many directories deep --redo=REDO do not use cached old data. Can be: 'meta' or 'matrix' - (matrix includes meta). \ No newline at end of file + (matrix includes meta). + --force ignore errors that usually stop the build and go ahead + anyways. \ No newline at end of file diff --git a/tools/cellbrowser-build-cellranger-arc.cwl b/tools/cellbrowser-build-cellranger-arc.cwl index 90632cb2..ace53c76 100644 --- a/tools/cellbrowser-build-cellranger-arc.cwl +++ b/tools/cellbrowser-build-cellranger-arc.cwl @@ -4,23 +4,25 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/cellbrowser:v0.0.2 + dockerPull: biowardrobe2/sc-tools:v0.0.39 requirements: +- class: EnvVarRequirement + envDef: + CBDATAROOT: $(runtime.outdir) - class: InlineJavascriptRequirement - class: InitialWorkDirRequirement listing: - - entryname: cellbrowser_gex.conf + - entryname: cellbrowser_rna.conf entry: | - name = "GEX" - shortLabel="GEX" + name = "RNA" + shortLabel = "RNA" priority = 1 - geneIdType="auto" - geneLabel="Gene" - exprMatrix="exprMatrix.tsv.gz" - meta="meta.csv" - coords=[ + geneIdType = "auto" + exprMatrix = "exprMatrix.tsv.gz" + meta = "meta.csv" + coords = [ { "file": "tsne.coords.csv", "shortLabel": "t-SNE" @@ -30,26 +32,27 @@ requirements: "shortLabel": "UMAP" } ] - markers=[ - { - "file":"markers.tsv", - "shortLabel":"Cluster-specific genes" - } + markers = [ + { + "file": "markers.tsv", + "shortLabel": "Cluster-specific genes" + } ] - enumFields = ["Barcode"] - clusterField="Cluster" - 
labelField="Cluster" - dataRoot="../" + geneLabel = "Feature" + radius = 3 + alpha = 0.5 + clusterField = "Cluster" + labelField = "Cluster" + dataRoot = "../" - entryname: cellbrowser_atac.conf entry: | name = "ATAC" - shortLabel="ATAC" + shortLabel = "ATAC" priority = 1 - geneIdType="auto" - geneLabel="Peak" - exprMatrix="exprMatrix.tsv.gz" - meta="meta.csv" - coords=[ + geneIdType = "auto" + exprMatrix = "exprMatrix.tsv.gz" + meta = "meta.csv" + coords = [ { "file": "tsne.coords.csv", "shortLabel": "t-SNE" @@ -63,22 +66,25 @@ requirements: "shortLabel": "LSA" } ] - markers=[ - { - "file":"markers.tsv", - "shortLabel":"Cluster-specific peaks" - } + markers = [ + { + "file": "markers.tsv", + "shortLabel": "Cluster-specific peaks" + } ] - enumFields = ["Barcode"] - clusterField="Cluster" - labelField="Cluster" - dataRoot="../" + geneLabel = "Feature" + radius = 3 + alpha = 0.5 + clusterField = "Cluster" + labelField = "Cluster" + dataRoot = "../" + atacSearch = "genome.current" - entryname: cellbrowser.conf entry: | - shortLabel="Multiome" - - entryname: desc_gex.conf + shortLabel = "Multiple datasets" + - entryname: desc_rna.conf entry: | - title = "GEX" + title = "RNA" abstract = "" methods = "" biorxiv_url = "" @@ -97,6 +103,10 @@ inputs: type: string? 
default: | #!/bin/bash + echo "Splitting combined feature-barcode matrix into RNA and ATAC matrices" + sc_cb_utils_split_mex.R --mex $1 --output temp_sc + echo "Preparing ATAC search file" + sc_cb_utils_atac_search.R --annotations $2 echo "Preparing ATAC data" mkdir -p ./atac_input/analysis/clustering/graphclust \ ./atac_input/analysis/diffexp/graphclust \ @@ -109,7 +119,7 @@ inputs: cp -r $0/dimensionality_reduction/atac/umap_projection.csv ./atac_input/analysis/umap/2_components/projection.csv cp -r $0/dimensionality_reduction/atac/lsa_projection.csv ./atac_input/analysis/lsa/2_components/projection.csv mkdir -p ./atac_input/filtered_feature_bc_matrix - cp -r $1/* ./atac_input/filtered_feature_bc_matrix/ + cp -r temp_sc_atac/* ./atac_input/filtered_feature_bc_matrix/ echo "Importing ATAC data" cbImportCellranger -i atac_input -o atac --name atac cd ./atac @@ -121,78 +131,92 @@ inputs: rm -f cellbrowser.conf desc.conf cp ../cellbrowser_atac.conf cellbrowser.conf cp ../desc_atac.conf desc.conf - if [[ -n $2 ]]; then + if [[ -n $3 ]]; then echo "Aggregation metadata file was provided. Adding initial cell identity classes" - cat $2 | grep -v "library_id" | awk '{print NR","$0}' > aggregation_metadata.csv + cat $3 | grep -v "library_id" | awk '{print NR","$0}' > aggregation_metadata.csv cat meta.csv | grep -v "Barcode" > meta_headerless.csv - echo "Barcode,Cluster,Identity" > meta.csv + echo "Barcode,Cluster,Dataset" > meta.csv awk -F, 'NR==FNR {identity[$1]=$2; next} {split($1,barcode,"-"); print $0","identity[barcode[2]]}' aggregation_metadata.csv meta_headerless.csv >> meta.csv rm -f aggregation_metadata.csv meta_headerless.csv fi + cbBuild -o ../html_data cd .. 
- echo "Preparing GEX data" - mkdir -p ./gex_input/analysis/clustering/graphclust \ - ./gex_input/analysis/diffexp/graphclust \ - ./gex_input/analysis/tsne/2_components \ - ./gex_input/analysis/umap/2_components - cp -r $0/clustering/gex/graphclust/clusters.csv ./gex_input/analysis/clustering/graphclust/clusters.csv - cp -r $0/clustering/gex/graphclust/differential_expression.csv ./gex_input/analysis/diffexp/graphclust/differential_expression.csv - cp -r $0/dimensionality_reduction/gex/tsne_projection.csv ./gex_input/analysis/tsne/2_components/projection.csv - cp -r $0/dimensionality_reduction/gex/umap_projection.csv ./gex_input/analysis/umap/2_components/projection.csv - mkdir -p ./gex_input/filtered_feature_bc_matrix - cp -r $1/* ./gex_input/filtered_feature_bc_matrix/ - echo "Importing GEX data" - cbImportCellranger -i gex_input -o gex --name gex - cd ./gex + echo "Preparing RNA data" + mkdir -p ./rna_input/analysis/clustering/graphclust \ + ./rna_input/analysis/diffexp/graphclust \ + ./rna_input/analysis/tsne/2_components \ + ./rna_input/analysis/umap/2_components + cp -r $0/clustering/gex/graphclust/clusters.csv ./rna_input/analysis/clustering/graphclust/clusters.csv + cp -r $0/clustering/gex/graphclust/differential_expression.csv ./rna_input/analysis/diffexp/graphclust/differential_expression.csv + cp -r $0/dimensionality_reduction/gex/tsne_projection.csv ./rna_input/analysis/tsne/2_components/projection.csv + cp -r $0/dimensionality_reduction/gex/umap_projection.csv ./rna_input/analysis/umap/2_components/projection.csv + mkdir -p ./rna_input/filtered_feature_bc_matrix + cp -r temp_sc_rna/* ./rna_input/filtered_feature_bc_matrix/ + echo "Importing RNA data" + cbImportCellranger -i rna_input -o rna --name rna + cd ./rna echo "Copying coordinates files" - cp ../gex_input/analysis/tsne/2_components/projection.csv tsne.coords.csv - cp ../gex_input/analysis/umap/2_components/projection.csv umap.coords.csv + cp ../rna_input/analysis/tsne/2_components/projection.csv 
tsne.coords.csv + cp ../rna_input/analysis/umap/2_components/projection.csv umap.coords.csv echo "Replacing configuration files" rm -f cellbrowser.conf desc.conf - cp ../cellbrowser_gex.conf cellbrowser.conf - cp ../desc_gex.conf desc.conf - if [[ -n $2 ]]; then + cp ../cellbrowser_rna.conf cellbrowser.conf + cp ../desc_rna.conf desc.conf + if [[ -n $3 ]]; then echo "Aggregation metadata file was provided. Adding initial cell identity classes" - cat $2 | grep -v "library_id" | awk '{print NR","$0}' > aggregation_metadata.csv + cat $3 | grep -v "library_id" | awk '{print NR","$0}' > aggregation_metadata.csv cat meta.csv | grep -v "Barcode" > meta_headerless.csv - echo "Barcode,Cluster,Identity" > meta.csv + echo "Barcode,Cluster,Dataset" > meta.csv awk -F, 'NR==FNR {identity[$1]=$2; next} {split($1,barcode,"-"); print $0","identity[barcode[2]]}' aggregation_metadata.csv meta_headerless.csv >> meta.csv rm -f aggregation_metadata.csv meta_headerless.csv fi + cbBuild -o ../html_data cd .. - echo "Building" - cbBuild -r -o html_data echo "Cleaning up temporary files" - rm -rf gex_input atac_input atac gex + rm -rf rna_input atac_input atac rna temp_sc_rna temp_sc_atac inputBinding: position: 5 doc: | - Bash script to run cbImportCellranger and cbBuild commands + Bash script to run cbImportCellranger + and cbBuild commands. secondary_analysis_report_folder: type: Directory inputBinding: position: 6 doc: | - Folder with secondary analysis results including dimensionality reduction, - cell clustering, and differential expression produced by Cellranger ARC - Count or Cellranger ARC Aggr + Folder with secondary analysis results + including dimensionality reduction, cell + clustering, and differential expression + produced by Cellranger ARC Count or + Cellranger ARC Aggr. 
filtered_feature_bc_matrix_folder: type: Directory inputBinding: position: 7 doc: | - Folder with filtered feature-barcode matrices containing only cellular - barcodes in MEX format produced by Cellranger ARC Count or Cellranger ARC Aggr + Folder with filtered feature-barcode + matrices containing only cellular + barcodes in MEX format produced by + Cellranger ARC Count or Cellranger + ARC Aggr. + + annotation_gtf_file: + type: File + inputBinding: + position: 8 + doc: | + GTF annotation file. aggregation_metadata: type: File? inputBinding: - position: 8 + position: 9 doc: | - Cellranger aggregation CSV file. If provided, the Identity metadata - column will be added to the meta.csv + Cellranger aggregation CSV file. If + provided, the Dataset metadata column + will be added to the meta.csv. outputs: @@ -270,7 +294,6 @@ s:creator: doc: | Cell Ranger ARC Count/Aggregate to UCSC Cell Browser - ===================================================== Exports clustering results from Cell Ranger ARC Count Chromatin Accessibility and Gene Expression or Cell @@ -294,12 +317,14 @@ s:about: | -m, --noMat do not export the matrix again, saves some time if you changed something small since the last run - Usage: cbBuild [options] -i cellbrowser.conf -o outputDir - add a dataset to the single cell viewer directory If you have previously built into the same output directory with the same dataset and the expression matrix has not changed its filesize, this will be detected and the expression matrix will not be copied again. This means that an update of a few meta data attributes is quite quick. + Gene symbol/annotation files are downloaded to ~/cellbrowserData when + needed. Config defaults can be specified in ~/.cellbrowser. 
See + documentation at https://cellbrowser.readthedocs.io/ Options: -h, --help show this help message and exit --init copy sample cellbrowser.conf and desc.conf to current @@ -311,11 +336,14 @@ s:about: | specified multiple times -o OUTDIR, --outDir=OUTDIR output directory, default can be set through the env. - variable CBOUT or ~/.cellbrowser.conf, current value: - none + variable CBOUT or ~/.cellbrowser, current value: none -p PORT, --port=PORT if build is successful, start an http server on this port and serve the result via http://localhost:port -r, --recursive run in all subdirectories of the current directory. - Useful when rebuilding a full hierarchy. + Useful when rebuilding a full hierarchy. Cannot be + used with -p. + --depth=DEPTH when using -r: only go this many directories deep --redo=REDO do not use cached old data. Can be: 'meta' or 'matrix' - (matrix includes meta). \ No newline at end of file + (matrix includes meta). + --force ignore errors that usually stop the build and go ahead + anyways. 
\ No newline at end of file diff --git a/tools/cellbrowser-build-cellranger-atac.cwl b/tools/cellbrowser-build-cellranger-atac.cwl index 9f716d9d..9f534536 100644 --- a/tools/cellbrowser-build-cellranger-atac.cwl +++ b/tools/cellbrowser-build-cellranger-atac.cwl @@ -4,7 +4,7 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/cellbrowser:v0.0.2 + dockerPull: biowardrobe2/sc-tools:v0.0.39 requirements: @@ -14,13 +14,12 @@ requirements: - entryname: cellbrowser.conf entry: | name = "ATAC" - shortLabel="ATAC" + shortLabel = "ATAC" priority = 1 - geneIdType="auto" - geneLabel="Feature" - exprMatrix="exprMatrix.tsv.gz" - meta="meta.csv" - coords=[ + geneIdType = "auto" + exprMatrix = "exprMatrix.tsv.gz" + meta = "meta.csv" + coords = [ { "file": "tsne.coords.csv", "shortLabel": "t-SNE" @@ -35,14 +34,17 @@ requirements: } ] markers=[ - { - "file":"markers.tsv", - "shortLabel":"Cluster-specific peaks" - } + { + "file": "markers.tsv", + "shortLabel": "Cluster-specific peaks" + } ] - enumFields = ["Barcode"] - clusterField="Cluster" - labelField="Cluster" + geneLabel = "Feature" + radius = 3 + alpha = 0.5 + clusterField = "Cluster" + labelField = "Cluster" + atacSearch = "genome.current" - entryname: desc.conf entry: | title = "ATAC" @@ -58,17 +60,17 @@ inputs: type: string? 
default: | #!/bin/bash + echo "Preparing ATAC search file" + sc_cb_utils_atac_search.R --annotations $2 echo "Prepare input data" mkdir -p ./cellbrowser_input/analysis/clustering/graphclust \ ./cellbrowser_input/analysis/diffexp/graphclust \ ./cellbrowser_input/filtered_feature_bc_matrix - cp -r $0/clustering/graphclust/clusters.csv ./cellbrowser_input/analysis/clustering/graphclust/clusters.csv cp -r $0/enrichment/graphclust/differential_expression.csv ./cellbrowser_input/analysis/diffexp/graphclust/differential_expression.csv cp -r $0/tsne ./cellbrowser_input/analysis/ cp -r $0/umap ./cellbrowser_input/analysis/ cp -r $0/lsa ./cellbrowser_input/analysis/ - cp -r $1/* ./cellbrowser_input/filtered_feature_bc_matrix/ cd ./cellbrowser_input/filtered_feature_bc_matrix/ gzip barcodes.tsv @@ -77,7 +79,6 @@ inputs: gzip features.tsv rm -f peaks.bed cd - - echo "Run cbImportCellranger" cbImportCellranger -i cellbrowser_input -o cellbrowser_output --name cellbrowser cd ./cellbrowser_output @@ -85,16 +86,15 @@ inputs: cp ../cellbrowser_input/analysis/tsne/*/projection.csv tsne.coords.csv cp ../cellbrowser_input/analysis/umap/*/projection.csv umap.coords.csv cp ../cellbrowser_input/analysis/lsa/*/projection.csv lsa.coords.csv - echo "Replace configuration files" rm -f cellbrowser.conf desc.conf cp ../cellbrowser.conf . cp ../desc.conf . - if [[ -n $2 ]]; then + if [[ -n $3 ]]; then echo "Aggregation metadata file was provided. 
Adding initial cell identity classes" - cat $2 | grep -v "library_id" | awk '{print NR","$0}' > aggregation_metadata.csv + cat $3 | grep -v "library_id" | awk '{print NR","$0}' > aggregation_metadata.csv cat meta.csv | grep -v "Barcode" > meta_headerless.csv - echo "Barcode,Cluster,Identity" > meta.csv + echo "Barcode,Cluster,Dataset" > meta.csv awk -F, 'NR==FNR {identity[$1]=$2; next} {split($1,barcode,"-"); print $0","identity[barcode[2]]}' aggregation_metadata.csv meta_headerless.csv >> meta.csv rm -f aggregation_metadata.csv meta_headerless.csv fi @@ -103,30 +103,41 @@ inputs: inputBinding: position: 5 doc: | - Bash script to run cbImportCellranger and cbBuild commands + Bash script to run cbImportCellranger + and cbBuild commands. secondary_analysis_report_folder: type: Directory inputBinding: position: 6 doc: | - Folder with secondary analysis results + Folder with secondary + analysis results. filtered_feature_bc_matrix_folder: type: Directory inputBinding: position: 7 doc: | - Folder with filtered peak-barcode matrices containing only - cellular barcodes in MEX format + Folder with filtered peak-barcode matrices + containing only cellular barcodes + in MEX format. + + annotation_gtf_file: + type: File + inputBinding: + position: 8 + doc: | + GTF annotation file. aggregation_metadata: type: File? inputBinding: - position: 8 + position: 9 doc: | - Cellranger aggregation CSV file. If provided, the Identity metadata - column will be added to the meta.csv + Cellranger aggregation CSV file. If + provided, the Dataset metadata column + will be added to the meta.csv. 
outputs: @@ -226,12 +237,14 @@ s:about: | -m, --noMat do not export the matrix again, saves some time if you changed something small since the last run - Usage: cbBuild [options] -i cellbrowser.conf -o outputDir - add a dataset to the single cell viewer directory If you have previously built into the same output directory with the same dataset and the expression matrix has not changed its filesize, this will be detected and the expression matrix will not be copied again. This means that an update of a few meta data attributes is quite quick. + Gene symbol/annotation files are downloaded to ~/cellbrowserData when + needed. Config defaults can be specified in ~/.cellbrowser. See + documentation at https://cellbrowser.readthedocs.io/ Options: -h, --help show this help message and exit --init copy sample cellbrowser.conf and desc.conf to current @@ -243,11 +256,14 @@ s:about: | specified multiple times -o OUTDIR, --outDir=OUTDIR output directory, default can be set through the env. - variable CBOUT or ~/.cellbrowser.conf, current value: - none + variable CBOUT or ~/.cellbrowser, current value: none -p PORT, --port=PORT if build is successful, start an http server on this port and serve the result via http://localhost:port -r, --recursive run in all subdirectories of the current directory. - Useful when rebuilding a full hierarchy. + Useful when rebuilding a full hierarchy. Cannot be + used with -p. + --depth=DEPTH when using -r: only go this many directories deep --redo=REDO do not use cached old data. Can be: 'meta' or 'matrix' - (matrix includes meta). \ No newline at end of file + (matrix includes meta). + --force ignore errors that usually stop the build and go ahead + anyways. 
\ No newline at end of file diff --git a/tools/cellbrowser-build-cellranger.cwl b/tools/cellbrowser-build-cellranger.cwl index 01a414bd..9e23a627 100644 --- a/tools/cellbrowser-build-cellranger.cwl +++ b/tools/cellbrowser-build-cellranger.cwl @@ -4,7 +4,7 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/cellbrowser:v0.0.2 + dockerPull: biowardrobe2/sc-tools:v0.0.39 requirements: @@ -13,13 +13,13 @@ requirements: listing: - entryname: cellbrowser.conf entry: | - name = "cellbrowser" - shortLabel="cellbrowser" + name = "RNA" + shortLabel = "RNA" priority = 1 - geneIdType="auto" - exprMatrix="exprMatrix.tsv.gz" - meta="meta.csv" - coords=[ + geneIdType = "auto" + exprMatrix = "exprMatrix.tsv.gz" + meta = "meta.csv" + coords = [ { "file": "tsne.coords.csv", "shortLabel": "CellRanger t-SNE" @@ -29,18 +29,20 @@ requirements: "shortLabel": "CellRanger UMAP" } ] - markers=[ - { - "file":"markers.tsv", - "shortLabel":"Cluster-specific genes" - } + markers = [ + { + "file": "markers.tsv", + "shortLabel": "Cluster-specific genes" + } ] - enumFields = ["Barcode"] - clusterField="Cluster" - labelField="Cluster" + geneLabel = "Feature" + radius = 3 + alpha = 0.5 + clusterField = "Cluster" + labelField = "Cluster" - entryname: desc.conf entry: | - title = "CellBrowser" + title = "RNA" abstract = "" methods = "" biorxiv_url = "" @@ -70,9 +72,9 @@ inputs: cp ../desc.conf . if [[ -n $2 ]]; then echo "Aggregation metadata file was provided. 
Adding initial cell identity classes" - cat $2 | grep -v "library_id" | awk '{print NR","$0}' > aggregation_metadata.csv + cat $2 | grep -v "sample_id" | awk '{print NR","$0}' > aggregation_metadata.csv cat meta.csv | grep -v "Barcode" > meta_headerless.csv - echo "Barcode,Cluster,Identity" > meta.csv + echo "Barcode,Cluster,Dataset" > meta.csv awk -F, 'NR==FNR {identity[$1]=$2; next} {split($1,barcode,"-"); print $0","identity[barcode[2]]}' aggregation_metadata.csv meta_headerless.csv >> meta.csv rm -f aggregation_metadata.csv meta_headerless.csv fi @@ -81,32 +83,38 @@ inputs: inputBinding: position: 5 doc: | - Bash script to run cbImportCellranger and cbBuild commands + Bash script to run cbImportCellranger + and cbBuild commands. secondary_analysis_report_folder: type: Directory inputBinding: position: 6 doc: | - Folder with secondary analysis results including dimensionality reduction, - cell clustering, and differential expression produced by Cellranger Count - or Cellranger Aggr + Folder with secondary analysis results + including dimensionality reduction, cell + clustering, and differential expression + produced by Cellranger Count or Cellranger + Aggr. filtered_feature_bc_matrix_folder: type: Directory inputBinding: position: 7 doc: | - Folder with filtered feature-barcode matrices containing only cellular - barcodes in MEX format produced by Cellranger Count or Cellranger Aggr + Folder with filtered feature-barcode + matrices containing only cellular + barcodes in MEX format produced by + Cellranger Count or Cellranger Aggr. aggregation_metadata: type: File? inputBinding: position: 8 doc: | - Cellranger aggregation CSV file. If provided, the Identity metadata - column will be added to the meta.csv + Cellranger aggregation CSV file. If + provided, the Dataset metadata column + will be added to the meta.csv. 
outputs: @@ -184,8 +192,7 @@ s:creator: doc: | Cell Ranger Count/Aggregate to UCSC Cell Browser - ================================================================= - + Exports clustering results from Cell Ranger Count Gene Expression and Cell Ranger Aggregate experiments into compatible with UCSC Cell Browser format. @@ -207,12 +214,14 @@ s:about: | -m, --noMat do not export the matrix again, saves some time if you changed something small since the last run - Usage: cbBuild [options] -i cellbrowser.conf -o outputDir - add a dataset to the single cell viewer directory If you have previously built into the same output directory with the same dataset and the expression matrix has not changed its filesize, this will be detected and the expression matrix will not be copied again. This means that an update of a few meta data attributes is quite quick. + Gene symbol/annotation files are downloaded to ~/cellbrowserData when + needed. Config defaults can be specified in ~/.cellbrowser. See + documentation at https://cellbrowser.readthedocs.io/ Options: -h, --help show this help message and exit --init copy sample cellbrowser.conf and desc.conf to current @@ -224,11 +233,14 @@ s:about: | specified multiple times -o OUTDIR, --outDir=OUTDIR output directory, default can be set through the env. - variable CBOUT or ~/.cellbrowser.conf, current value: - none + variable CBOUT or ~/.cellbrowser, current value: none -p PORT, --port=PORT if build is successful, start an http server on this port and serve the result via http://localhost:port -r, --recursive run in all subdirectories of the current directory. - Useful when rebuilding a full hierarchy. + Useful when rebuilding a full hierarchy. Cannot be + used with -p. + --depth=DEPTH when using -r: only go this many directories deep --redo=REDO do not use cached old data. Can be: 'meta' or 'matrix' - (matrix includes meta). \ No newline at end of file + (matrix includes meta). 
+ --force ignore errors that usually stop the build and go ahead + anyways. \ No newline at end of file diff --git a/tools/cellranger-aggr.cwl b/tools/cellranger-aggr.cwl index 32a8153a..b45a7fc9 100644 --- a/tools/cellranger-aggr.cwl +++ b/tools/cellranger-aggr.cwl @@ -55,7 +55,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: cumulusprod/cellranger:7.0.0 + dockerPull: cumulusprod/cellranger:8.0.1 inputs: @@ -176,14 +176,14 @@ outputs: outputBinding: glob: "aggregated/outs/count/summary.json" doc: | - Aggregated GEX run summary metrics in JSON format + Aggregated RNA run summary metrics in JSON format secondary_analysis_report_folder: type: Directory outputBinding: glob: "aggregated/outs/count/analysis" doc: | - Folder with secondary analysis of GEX data including dimensionality reduction, + Folder with secondary analysis of RNA data including dimensionality reduction, cell clustering, and differential expression filtered_feature_bc_matrix_folder: @@ -223,6 +223,15 @@ outputs: doc: | Loupe Browser visualization and analysis file + airr_rearrangement_tsv: + type: File? + outputBinding: + glob: "aggregated/outs/vdj_*/airr_rearrangement.tsv" + doc: | + Annotated contigs and consensus sequences of V(D)J + rearrangements in the AIRR format. It includes only + viable cells identified by both V(D)J and RNA algorithms. + clonotypes_csv: type: File? 
outputBinding: @@ -280,11 +289,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cell Ranger Aggregate" -s:name: "Cell Ranger Aggregate" -s:alternateName: | - Aggregates outputs from multiple runs of Cell Ranger Count Gene Expression or - Cell Ranger Multi Gene Expression and V(D)J Repertoire Profiling experiments +label: "Cell Ranger Aggregate (RNA, RNA+VDJ)" +s:name: "Cell Ranger Aggregate (RNA, RNA+VDJ)" +s:alternateName: "Combines outputs from multiple runs of either Cell Ranger Count (RNA) or Cell Ranger Count (RNA+VDJ) pipelines" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-aggr.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -322,14 +329,13 @@ s:creator: doc: | - Cell Ranger Aggregate + Cell Ranger Aggregate (RNA, RNA+VDJ) - Aggregates outputs from multiple runs of Cell Ranger Count Gene - Expression (if molecule_info_h5 input provided) or Cell Ranger - Multi Gene Expression and V(D)J Repertoire Profiling experiments - (if filtered_data_folder input provided). If both inputs are - provided - use molecule_info_h5. If neither of them was provided - cellranger aggr will fail. + Aggregates outputs from multiple runs of the "Cell Ranger Count + (RNA)" (if molecule_info_h5 input provided) or "Cell Ranger Count + (RNA+VDJ)" experiments (if filtered_data_folder input provided). + If both inputs are provided - uses molecule_info_h5. If neither of + them are provided cellranger aggr will fail. 
Parameters set by default: --disable-ui - no need in any UI when running in Docker container @@ -339,6 +345,7 @@ doc: | Skipped parameters: --nosecondary --dry + --min-crispr-umi --noexit --nopreflight --description @@ -347,9 +354,10 @@ doc: | --maxjobs --jobinterval --overrides + --output-dir --uiport - Not supported features when aggregating GEX experiments: + Not supported features when aggregating RNA experiments: - Batch correction caused by different versions of the Single Cell Gene Expression chemistry is not supported as the generated metadata file for merging molecule_info_h5 inputs doesn't include "batch" field. @@ -358,29 +366,32 @@ doc: | s:about: | Aggregate data from multiple Cell Ranger runs - USAGE: - cellranger aggr [OPTIONS] --id --csv - - OPTIONS: - --id A unique run id and output folder name [a-zA-Z0-9_-]+ - --description Sample description to embed in output files [default: ] - --csv Path of CSV file enumerating 'cellranger count/vdj/multi' outputs - --normalize Library depth normalization mode [default: mapped] [possible values: mapped, none] - --nosecondary Disable secondary analysis, e.g. clustering - --dry Do not execute the pipeline. Generate a pipeline invocation (.mro) file and stop - --jobmode Job manager to use. Valid options: local (default), sge, lsf, slurm or path to a .template file. Search for help on "Cluster Mode" at - support.10xgenomics.com for more details on configuring the pipeline to use a compute cluster [default: local] - --localcores Set max cores the pipeline may request at one time. Only applies to local jobs - --localmem Set max GB the pipeline may request at one time. Only applies to local jobs - --localvmem Set max virtual address space in GB for the pipeline. Only applies to local jobs - --mempercore Reserve enough threads for each job to ensure enough memory will be available, assuming each core on your cluster has at least this much memory - available. 
Only applies to cluster jobmodes - --maxjobs Set max jobs submitted to cluster at one time. Only applies to cluster jobmodes - --jobinterval Set delay between submitting jobs to cluster, in ms. Only applies to cluster jobmodes - --overrides The path to a JSON file that specifies stage-level overrides for cores and memory. Finer-grained than --localcores, --mempercore and --localmem. - Consult https://support.10xgenomics.com/ for an example override file - --uiport Serve web UI at http://localhost:PORT - --disable-ui Do not serve the web UI - --noexit Keep web UI running after pipestance completes or fails - --nopreflight Skip preflight checks - -h, --help Print help information \ No newline at end of file + Usage: cellranger aggr [OPTIONS] --id --csv + + Options: + --id A unique run id and output folder name [a-zA-Z0-9_-]+ + --description Sample description to embed in output files [default: ] + --csv Path of CSV file enumerating 'cellranger count/vdj/multi' outputs + --normalize Library depth normalization mode [default: mapped] [possible values: mapped, none] + --nosecondary Disable secondary analysis, e.g. clustering + --dry Do not execute the pipeline. Generate a pipeline invocation (.mro) file and stop + --min-crispr-umi Minimum CRISPR UMI threshold [default: 3] + --jobmode Job manager to use. Valid options: local (default), sge, lsf, slurm or path to a .template file. + Search for help on "Cluster Mode" at support.10xgenomics.com for more details on configuring the + pipeline to use a compute cluster + --localcores Set max cores the pipeline may request at one time. Only applies to local jobs + --localmem Set max GB the pipeline may request at one time. Only applies to local jobs + --localvmem Set max virtual address space in GB for the pipeline. Only applies to local jobs + --mempercore Reserve enough threads for each job to ensure enough memory will be available, assuming each core + on your cluster has at least this much memory available. 
Only applies to cluster jobmodes + --maxjobs Set max jobs submitted to cluster at one time. Only applies to cluster jobmodes + --jobinterval Set delay between submitting jobs to cluster, in ms. Only applies to cluster jobmodes + --overrides The path to a JSON file that specifies stage-level overrides for cores and memory. Finer-grained + than --localcores, --mempercore and --localmem. Consult https://support.10xgenomics.com/ for an + example override file + --output-dir Output the results to this directory + --uiport Serve web UI at http://localhost:PORT + --disable-ui Do not serve the web UI + --noexit Keep web UI running after pipestance completes or fails + --nopreflight Skip preflight checks + -h, --help Print help \ No newline at end of file diff --git a/tools/cellranger-arc-aggr.cwl b/tools/cellranger-arc-aggr.cwl index 608d57aa..6f6e82f5 100644 --- a/tools/cellranger-arc-aggr.cwl +++ b/tools/cellranger-arc-aggr.cwl @@ -6,8 +6,8 @@ requirements: - class: InlineJavascriptRequirement expressionLib: - var get_label = function(i) { - var rootname = inputs.gex_molecule_info_h5[i].basename.split('.').slice(0,-1).join('.'); - rootname = (rootname=="")?inputs.gex_molecule_info_h5[i].basename:rootname; + var rootname = inputs.rna_molecule_info_h5[i].basename.split('.').slice(0,-1).join('.'); + rootname = (rootname=="")?inputs.rna_molecule_info_h5[i].basename:rootname; return inputs.gem_well_labels?inputs.gem_well_labels[i].replace(/\t|\s|\[|\]|\>|\<|,|\./g, "_"):rootname; }; - class: InitialWorkDirRequirement @@ -15,8 +15,8 @@ requirements: ${ var entry = "library_id,atac_fragments,per_barcode_metrics,gex_molecule_info\n" var grouping = "library_id\tcondition\n" - for (var i=0; i < inputs.gex_molecule_info_h5.length; i++){ - entry += get_label(i) + "," + inputs.atac_fragments_file_from_count[i].path + "," + inputs.barcode_metrics_report[i].path + "," + inputs.gex_molecule_info_h5[i].path + "\n" + for (var i=0; i < inputs.rna_molecule_info_h5.length; i++){ + entry += 
get_label(i) + "," + inputs.atac_fragments_file_from_count[i].path + "," + inputs.barcode_metrics_report[i].path + "," + inputs.rna_molecule_info_h5[i].path + "\n" grouping += get_label(i) + "\t" + get_label(i) + "\n" } return [ @@ -44,30 +44,37 @@ inputs: secondaryFiles: - .tbi doc: | - Array of files containing count and barcode information for every ATAC - fragment observed in the experiment in TSV format. Outputs from + Array of files containing count and + barcode information for every ATAC + fragment observed in the experiment + in TSV format. Outputs from the "cellranger-arc count" command. barcode_metrics_report: type: File[] doc: | - Array of files with the ATAC and GEX read count summaries generated for every - barcode observed in the experiment. Outputs from "cellranger-arc count" command. + Array of files with the ATAC and RNA + read count summaries generated for + every barcode observed in the + experiment. Outputs from the + "cellranger-arc count" command. - gex_molecule_info_h5: + rna_molecule_info_h5: type: File[] doc: | - Array of GEX molecule-level information files in HDF5 format. - Outputs from "cellranger-arc count" command. + Array of RNA molecule-level information + files in HDF5 format. Outputs from the + "cellranger-arc count" command. gem_well_labels: type: - "null" - string[] doc: | - Array of GEM well identifiers to be used for labeling purposes only. - If not provided use rootnames of files from the gex_molecule_info_h5 - input + Array of GEM well identifiers to be + used for labeling purposes only. If + not provided use rootnames of files + from the rna_molecule_info_h5 input. indices_folder: type: Directory @@ -75,9 +82,10 @@ inputs: position: 5 prefix: "--reference" doc: | - Compatible with Cell Ranger ARC reference folder that includes - STAR and BWA indices. Should be generated by "cellranger-arc mkref" - command + Compatible with Cell Ranger ARC reference + folder that includes STAR and BWA indices. 
+ Should be generated by the + "cellranger-arc mkref" command. normalization_mode: type: @@ -89,7 +97,8 @@ inputs: position: 6 prefix: "--normalize" doc: | - Library depth normalization mode: depth, none. + Library depth normalization + mode: depth, none. Default: depth threads: @@ -98,7 +107,8 @@ inputs: position: 7 prefix: "--localcores" doc: | - Set max cores the pipeline may request at one time. + Set max cores the pipeline may + request at one time. Default: all available memory_limit: @@ -107,7 +117,8 @@ inputs: position: 8 prefix: "--localmem" doc: | - Set max GB the pipeline may request at one time + Set max GB the pipeline may + request at one time. Default: all available virt_memory_limit: @@ -116,7 +127,8 @@ inputs: position: 9 prefix: "--localvmem" doc: | - Set max virtual address space in GB for the pipeline + Set max virtual address space + in GB for the pipeline. Default: all available @@ -127,14 +139,16 @@ outputs: outputBinding: glob: "aggregated/outs/web_summary.html" doc: | - Aggregated run summary metrics and charts in HTML format + Aggregated run summary metrics + and charts in HTML format. metrics_summary_report: type: File outputBinding: glob: "aggregated/outs/summary.csv" doc: | - Aggregated run summary metrics in CSV format + Aggregated run summary metrics + in CSV format. atac_fragments_file: type: File @@ -143,86 +157,101 @@ outputs: secondaryFiles: - .tbi doc: | - Count and barcode information for every ATAC fragment observed in the - aggregated experiment in TSV format + Count and barcode information for + every ATAC fragment observed in the + aggregated experiment in TSV format. atac_peaks_bed_file: type: File outputBinding: glob: "aggregated/outs/atac_peaks.bed" doc: | - Locations of open-chromatin regions identified in aggregated experiment - (these regions are referred to as "peaks") + Locations of open-chromatin regions + identified in aggregated experiment + (these regions are referred to as + "peaks"). 
atac_peak_annotation_file: type: File outputBinding: glob: "aggregated/outs/atac_peak_annotation.tsv" doc: | - Annotations of peaks based on genomic proximity alone (for aggregated - experiment). Note that these are not functional annotations and they - do not make use of linkage with GEX data. + Annotations of peaks based on + genomic proximity alone (for + aggregated experiment). Note + that these are not functional + annotations and they do not + make use of linkage with RNA + data. secondary_analysis_report_folder: type: Directory outputBinding: glob: "aggregated/outs/analysis" doc: | - Folder with secondary analysis results including dimensionality reduction, - cell clustering, and differential expression for aggregated results + Folder with secondary analysis results + including dimensionality reduction, cell + clustering, and differential expression + for aggregated results. filtered_feature_bc_matrix_folder: type: Directory outputBinding: glob: "aggregated/outs/filtered_feature_bc_matrix" doc: | - Folder with aggregated filtered feature-barcode matrices containing only - cellular barcodes in MEX format + Folder with aggregated filtered + feature-barcode matrices containing + only cellular barcodes in MEX format. filtered_feature_bc_matrix_h5: type: File outputBinding: glob: "aggregated/outs/filtered_feature_bc_matrix.h5" doc: | - Aggregated filtered feature-barcode matrices containing only cellular barcodes - in HDF5 format + Aggregated filtered feature-barcode + matrices containing only cellular + barcodes in HDF5 format. raw_feature_bc_matrices_folder: type: Directory outputBinding: glob: "aggregated/outs/raw_feature_bc_matrix" doc: | - Folder with aggregated unfiltered feature-barcode matrices containing all barcodes - in MEX format + Folder with aggregated unfiltered + feature-barcode matrices containing + all barcodes in MEX format. 
raw_feature_bc_matrices_h5: type: File outputBinding: glob: "aggregated/outs/raw_feature_bc_matrix.h5" doc: | - Aggregated unfiltered feature-barcode matrices containing all barcodes - in HDF5 format + Aggregated unfiltered feature-barcode + matrices containing all barcodes in + HDF5 format. aggregation_metadata: type: File outputBinding: glob: "aggregated/outs/aggr.csv" doc: | - Copy of the input aggregation CSV file + Copy of the input aggregation CSV file. grouping_data: type: File outputBinding: glob: "grouping.tsv" doc: | - Example of TSV file to define datasets grouping + Example of TSV file to define + datasets grouping. loupe_browser_track: type: File outputBinding: glob: "aggregated/outs/cloupe.cloupe" doc: | - Loupe Browser visualization and analysis file for aggregated results + Loupe Browser visualization and + analysis file for aggregated results. stdout_log: type: stdout @@ -244,9 +273,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cellranger ARC Aggregate" -s:name: "Cellranger ARC Aggregate" -s:alternateName: "Aggregates outputs from multiple runs of Cell Ranger ARC Count Chromatin Accessibility and Gene Expression" +label: "Cell Ranger Aggregate (RNA+ATAC)" +s:name: "Cell Ranger Aggregate (RNA+ATAC)" +s:alternateName: "Combines outputs from multiple runs of Cell Ranger Count (RNA+ATAC) pipeline" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-arc-aggr.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -284,11 +313,10 @@ s:creator: doc: | - Cellranger ARC Aggregate - ======================================================================== + Cell Ranger Aggregate (RNA+ATAC) - Aggregates outputs from multiple runs of Cell Ranger ARC Count Chromatin - Accessibility and Gene Expression. + Combines outputs from multiple runs of the "Cell Ranger Count (RNA+ATAC)" + pipeline.
Parameters set by default: --disable-ui - no need in any UI when running in Docker container diff --git a/tools/cellranger-arc-count.cwl b/tools/cellranger-arc-count.cwl index d302012a..3d8a18fa 100644 --- a/tools/cellranger-arc-count.cwl +++ b/tools/cellranger-arc-count.cwl @@ -9,13 +9,13 @@ requirements: ${ var listing = [ { - "entry": inputs.gex_fastq_file_r1, - "entryname": "gex_S1_L001_R1_001.fastq", + "entry": inputs.rna_fastq_file_r1, + "entryname": "rna_S1_L001_R1_001.fastq", "writable": true }, { - "entry": inputs.gex_fastq_file_r2, - "entryname": "gex_S1_L001_R2_001.fastq", + "entry": inputs.rna_fastq_file_r2, + "entryname": "rna_S1_L001_R2_001.fastq", "writable": true }, { @@ -35,25 +35,25 @@ requirements: }, { "entry":`fastqs,sample,library_type - ${runtime.outdir},gex,Gene Expression + ${runtime.outdir},rna,Gene Expression ${runtime.outdir},atac,Chromatin Accessibility`, "entryname": "libraries.csv" } ] - if (inputs.gex_fastq_file_i1){ + if (inputs.rna_fastq_file_i1){ listing.push( { - "entry": inputs.gex_fastq_file_i1, - "entryname": "gex_S1_L001_I1_001.fastq", + "entry": inputs.rna_fastq_file_i1, + "entryname": "rna_S1_L001_I1_001.fastq", "writable": true } ); }; - if (inputs.gex_fastq_file_i2){ + if (inputs.rna_fastq_file_i2){ listing.push( { - "entry": inputs.gex_fastq_file_i2, - "entryname": "gex_S1_L001_I2_001.fastq", + "entry": inputs.rna_fastq_file_i2, + "entryname": "rna_S1_L001_I2_001.fastq", "writable": true } ); @@ -78,45 +78,63 @@ hints: inputs: - gex_fastq_file_r1: + rna_fastq_file_r1: type: File doc: | - GEX FASTQ read 1 file (will be staged into workdir as gex_S1_L001_R1_001.fastq) + RNA FASTQ read 1 file. + It will be staged into workdir + as rna_S1_L001_R1_001.fastq. - gex_fastq_file_r2: + rna_fastq_file_r2: type: File doc: | - GEX FASTQ read 2 file (will be staged into workdir as gex_S1_L001_R2_001.fastq) + RNA FASTQ read 2 file. + It will be staged into workdir + as rna_S1_L001_R2_001.fastq. 
- gex_fastq_file_i1: + rna_fastq_file_i1: type: File? doc: | - GEX FASTQ index i7 file (will be staged into workdir as gex_S1_L001_I1_001.fastq) + RNA FASTQ index i7 file. + It will be staged into workdir + as rna_S1_L001_I1_001.fastq. - gex_fastq_file_i2: + rna_fastq_file_i2: type: File? doc: | - GEX FASTQ index i5 file (will be staged into workdir as gex_S1_L001_I2_001.fastq) + RNA FASTQ index i5 file. + It will be staged into workdir + as rna_S1_L001_I2_001.fastq. atac_fastq_file_r1: type: File doc: | - ATAC FASTQ read 1 file (will be staged into workdir as atac_S1_L001_R1_001.fastq) + ATAC FASTQ read 1 file. + It will be staged into workdir + as atac_S1_L001_R1_001.fastq. atac_fastq_file_r2: type: File doc: | - ATAC FASTQ read 2 (it's actually index i5) file (will be staged into workdir as atac_S1_L001_R2_001.fastq) + ATAC FASTQ read 2 file. + Alternative name is index i5 file. + It will be staged into workdir + as atac_S1_L001_R2_001.fastq. atac_fastq_file_r3: type: File doc: | - ATAC FASTQ read 3 (it's actually read 2) file (will be staged into workdir as atac_S1_L001_R3_001.fastq) + ATAC FASTQ read 3 file. + Alternative name is read 2 file. + It will be staged into workdir + as atac_S1_L001_R3_001.fastq. atac_fastq_file_i1: type: File? doc: | - ATAC FASTQ index i7 file (will be staged into workdir as atac_S1_L001_I1_001.fastq) + ATAC FASTQ index i7 file. + It will be staged into workdir + as atac_S1_L001_I1_001.fastq. indices_folder: type: Directory @@ -124,9 +142,11 @@ inputs: position: 5 prefix: "--reference" doc: | - Compatible with Cell Ranger ARC reference folder that includes - STAR and BWA indices. Should be generated by "cellranger-arc mkref" - command + Compatible with Cell Ranger ARC + reference folder that includes + STAR and BWA indices. Should be + generated by "cellranger-arc + mkref" command. exclude_introns: type: boolean? @@ -134,9 +154,13 @@ inputs: position: 6 prefix: "--gex-exclude-introns" doc: | - Disable counting of intronic reads. 
In this mode, only reads that are exonic - and compatible with annotated splice junctions in the reference are counted. - Note: using this mode will reduce the UMI counts in the feature-barcode matrix + Disable counting of intronic reads. + In this mode, only reads that are + exonic and compatible with annotated + splice junctions in the reference are + counted. Note: using this mode will + reduce the UMI counts in the + feature-barcode matrix. force_min_atac_counts: type: int? @@ -144,22 +168,24 @@ inputs: position: 7 prefix: "--min-atac-count" doc: | - Cell caller override: define the minimum number of ATAC transposition events - in peaks (ATAC counts) for a cell barcode. - Note: this option must be specified in conjunction with `--min-gex-count`. - With `--min-atac-count=X` and `--min-gex-count=Y` a barcode is defined as a cell - if it contains at least X ATAC counts AND at least Y GEX UMI counts + Cell caller override: define the minimum number of ATAC + transposition events in peaks (ATAC counts) for a cell barcode. + Note: this option must be specified in conjunction with + `--min-gex-count`. With `--min-atac-count=X` and `--min-gex-count=Y`, + a barcode is defined as a cell if it contains at least X ATAC + counts AND at least Y RNA UMI counts. - force_min_gex_counts: + force_min_rna_counts: type: int? inputBinding: position: 8 prefix: "--min-gex-count" doc: | - Cell caller override: define the minimum number of GEX UMI counts for a cell barcode. - Note: this option must be specified in conjunction with `--min-atac-count`. - With `--min-atac-count=X` and `--min-gex-count=Y` a barcode is defined as a cell - if it contains at least X ATAC counts AND at least Y GEX UMI counts + Cell caller override: define the minimum number of RNA UMI + counts for a cell barcode. Note: this option must be + specified with `--min-atac-count`. With `--min-atac-count=X` + and `--min-gex-count=Y`, a barcode is defined as a cell if + it has at least X ATAC counts AND Y RNA UMI counts.
force_peaks_bed_file: type: File? @@ -167,9 +193,11 @@ inputs: position: 9 prefix: "--peaks" doc: | - Peak caller override: specify peaks to use in downstream analyses from supplied 3-column BED file. - The supplied peaks file must be sorted by position and not contain overlapping peaks; - comment lines beginning with `#` are allowed + Peak caller override: specify peaks to use in downstream + analyses from supplied 3-column BED file. The supplied + peaks file must be sorted by position and not contain + overlapping peaks; comment lines beginning with `#` are + allowed. threads: type: int? @@ -177,7 +205,8 @@ inputs: position: 10 prefix: "--localcores" doc: | - Set max cores the pipeline may request at one time. + Set max cores the pipeline + may request at one time. Default: all available memory_limit: @@ -186,7 +215,8 @@ inputs: position: 11 prefix: "--localmem" doc: | - Set max GB the pipeline may request at one time + Set max GB the pipeline + may request at one time. Default: all available virt_memory_limit: @@ -195,7 +225,8 @@ inputs: position: 12 prefix: "--localvmem" doc: | - Set max virtual address space in GB for the pipeline + Set max virtual address + space in GB for the pipeline. Default: all available @@ -206,21 +237,23 @@ outputs: outputBinding: glob: "sample/outs/web_summary.html" doc: | - Run summary metrics and charts in HTML format + Run summary metrics and charts + in HTML format. metrics_summary_report: type: File outputBinding: glob: "sample/outs/summary.csv" doc: | - Run summary metrics in CSV format + Run summary metrics + in CSV format. barcode_metrics_report: type: File outputBinding: glob: "sample/outs/per_barcode_metrics.csv" doc: | - ATAC and GEX read count summaries generated for every + ATAC and RNA read count summaries generated for every barcode observed in the experiment. 
The columns contain the paired ATAC and Gene Expression barcode sequences, ATAC and Gene Expression QC metrics for that barcode, @@ -229,16 +262,17 @@ outputs: More details: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/output/per_barcode_metrics - gex_possorted_genome_bam_bai: + rna_possorted_genome_bam_bai: type: File outputBinding: glob: "sample/outs/gex_possorted_bam.bam" secondaryFiles: - .bai doc: | - GEX position-sorted reads aligned to the genome and transcriptome annotated with barcode - information in BAM format - + RNA position-sorted reads aligned to + the genome and transcriptome annotated + with barcode information in BAM format. + atac_possorted_genome_bam_bai: type: File outputBinding: @@ -246,73 +280,82 @@ outputs: secondaryFiles: - .bai doc: | - ATAC position-sorted reads aligned to the genome annotated with barcode - information in BAM format + ATAC position-sorted reads aligned to + the genome annotated with barcode + information in BAM format. filtered_feature_bc_matrix_folder: type: Directory outputBinding: glob: "sample/outs/filtered_feature_bc_matrix" doc: | - Filtered feature barcode matrix stored as a CSC sparse matrix in MEX format. - The rows consist of all the gene and peak features concatenated together - (identical to raw feature barcode matrix) and the columns are restricted to - those barcodes that are identified as cells. + Filtered feature barcode matrix stored as a CSC sparse + matrix in MEX format. The rows consist of all gene and + peak features concatenated together (identical to raw + feature barcode matrix), and the columns are restricted + to barcodes identified as cells. filtered_feature_bc_matrix_h5: type: File outputBinding: glob: "sample/outs/filtered_feature_bc_matrix.h5" doc: | - Filtered feature barcode matrix stored as a CSC sparse matrix in hdf5 format. 
- The rows consist of all the gene and peak features concatenated together - (identical to raw feature barcode matrix) and the columns are restricted to - those barcodes that are identified as cells. + Filtered feature barcode matrix stored as a CSC sparse + matrix in hdf5 format. The rows consist of all gene and + peak features concatenated together (identical to raw + feature barcode matrix), and the columns are restricted + to barcodes identified as cells. raw_feature_bc_matrices_folder: type: Directory outputBinding: glob: "sample/outs/raw_feature_bc_matrix" doc: | - Raw feature barcode matrix stored as a CSC sparse matrix in MEX format. - The rows consist of all the gene and peak features concatenated together - and the columns consist of all observed barcodes with non-zero signal for - either ATAC or gene expression. + Raw feature barcode matrix stored as a CSC sparse matrix + in MEX format. The rows consist of all gene and peak + features concatenated together, and the columns consist + of all observed barcodes with non-zero signal for either + ATAC or gene expression. raw_feature_bc_matrices_h5: type: File outputBinding: glob: "sample/outs/raw_feature_bc_matrix.h5" doc: | - Raw feature barcode matrix stored as a CSC sparse matrix in hdf5 format. - The rows consist of all the gene and peak features concatenated together - and the columns consist of all observed barcodes with non-zero signal for - either ATAC or gene expression. + Raw feature barcode matrix stored as a CSC sparse matrix + in hdf5 format. The rows consist of all gene and peak + features concatenated together, and the columns consist + of all observed barcodes with non-zero signal for either + ATAC or gene expression. 
secondary_analysis_report_folder: type: Directory outputBinding: glob: "sample/outs/analysis" doc: | - Various secondary analyses that utilize the ATAC data, the GEX data, and their - linkage: dimensionality reduction and clustering results for the ATAC and GEX - data, differential expression, and differential accessibility for all clustering - results above and linkage between ATAC and GEX data. + Various secondary analyses that utilize the ATAC data, + the RNA data, and their linkage: dimensionality reduction + and clustering results for the ATAC and RNA data, + differential expression, and differential accessibility + for all clustering results, and linkage between ATAC and + RNA data. - gex_molecule_info_h5: + rna_molecule_info_h5: type: File outputBinding: glob: "sample/outs/gex_molecule_info.h5" doc: | - Count and barcode information for every GEX molecule observed in the experiment - in hdf5 format. + Count and barcode information for + every RNA molecule observed in + the experiment in hdf5 format. loupe_browser_track: type: File outputBinding: glob: "sample/outs/cloupe.cloupe" doc: | - Loupe Browser visualization file with all the analysis outputs + Loupe Browser visualization file + with all the analysis outputs. atac_fragments_file: type: File @@ -321,7 +364,8 @@ outputs: secondaryFiles: - .tbi doc: | - Count and barcode information for every ATAC fragment observed in + Count and barcode information for + every ATAC fragment observed in the experiment in TSV format. atac_peaks_bed_file: @@ -329,25 +373,29 @@ outputs: outputBinding: glob: "sample/outs/atac_peaks.bed" doc: | - Locations of open-chromatin regions identified in this sample. - These regions are referred to as "peaks". + Locations of open-chromatin regions + identified in this sample. These + regions are referred to as "peaks". 
atac_cut_sites_bigwig_file: type: File outputBinding: glob: "sample/outs/atac_cut_sites.bigwig" doc: | - Genome track of observed transposition sites in the experiment - smoothed at a resolution of 400 bases in BIGWIG format. + Genome track of observed transposition + sites in the experiment smoothed at a + resolution of 400 bases in bigWig format. atac_peak_annotation_file: type: File outputBinding: glob: "sample/outs/atac_peak_annotation.tsv" doc: | - Annotations of peaks based on genomic proximity alone. - Note that these are not functional annotations and they - do not make use of linkage with GEX data. + Annotations of peaks based on genomic + proximity alone. Note that these are + not functional annotations and they + do not make use of linkage with RNA + data. stdout_log: type: stdout @@ -369,9 +417,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cell Ranger ARC Count Chromatin Accessibility and Gene Expression" -s:name: "Cell Ranger ARC Count Chromatin Accessibility and Gene Expression" -s:alternateName: "Quantifies chromatin accessibility and gene expression from a single-cell Multiome ATAC/RNA-Seq library" +s:name: "Cell Ranger Count (RNA+ATAC)" +label: "Cell Ranger Count (RNA+ATAC)" +s:alternateName: "Quantifies single-cell gene expression and chromatin accessibility of the sequencing data from a single 10x Genomics library in a combined manner" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-arc-count.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -409,11 +457,11 @@ s:creator: doc: | - Cell Ranger ARC Count Chromatin Accessibility and Gene Expression - ================================================================= + Cell Ranger Count (RNA+ATAC) - Quantifies chromatin accessibility and gene expression from a - single-cell Multiome ATAC/RNA-Seq library. 
+ Quantifies single-cell gene expression and chromatin accessibility + of the sequencing data from a single 10x Genomics library in a + combined manner. Parameters set by default: --disable-ui - no need in any UI when running in Docker container @@ -439,7 +487,7 @@ doc: | Cell Ranger ARC count performs alignment, filtering, barcode counting, - peak calling and counting of both ATAC and GEX molecules. Furthermore, + peak calling and counting of both ATAC and RNA molecules. Furthermore, it uses the Chromium cellular barcodes to generate feature-barcode matrices, perform dimensionality reduction, determine clusters, perform differential analysis on clusters and identify linkages between peaks and genes. The diff --git a/tools/cellranger-arc-mkref.cwl b/tools/cellranger-arc-mkref.cwl index 2e50840e..2f9d4594 100644 --- a/tools/cellranger-arc-mkref.cwl +++ b/tools/cellranger-arc-mkref.cwl @@ -56,10 +56,11 @@ inputs: - "null" - string[] doc: | - Contigs that do not have any chromatin structure, for example, - mitochondria or plastids. These contigs are excluded from peak - calling since the entire contig will be "open" due to a lack of - chromatin structure + Contigs that do not have any chromatin structure, + for example, mitochondria or plastids. These + contigs are excluded from peak calling since the + entire contig will be "open" due to a lack of + chromatin structure. output_folder_name: type: string? @@ -93,15 +94,16 @@ outputs: outputBinding: glob: $(get_output_folder_name()) doc: | - Compatible with Cell Ranger ARC reference folder that includes - STAR and BWA indices + Compatible with Cell Ranger ARC reference + folder that includes STAR and BWA indices. chrom_length_file: type: File outputBinding: glob: $(get_output_folder_name() + "/star/chrNameLength.txt") doc: | - Chromosome length file in TSV format + Chromosome length file + in TSV format. 
stdout_log: type: stdout @@ -123,9 +125,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cell Ranger ARC Build Reference Indices" -s:name: "Cell Ranger ARC Build Reference Indices" -s:alternateName: "Builds Cell Ranger ARC compatible reference folder from the custom genome FASTA and gene GTF annotation files" +s:name: "Cell Ranger Reference (RNA+ATAC)" +label: "Cell Ranger Reference (RNA+ATAC)" +s:alternateName: "Builds a reference genome of a selected species for quantifying gene expression and chromatin accessibility" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-arc-mkref.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -163,11 +165,10 @@ s:creator: doc: | - Cell Ranger ARC Build Reference Indices - ==================================================================== - - Builds Cell Ranger ARC compatible reference folder from the custom - genome FASTA and gene GTF annotation files. + Cell Ranger Reference (RNA+ATAC) + + Builds a reference genome of a selected species for quantifying + gene expression and chromatin accessibility. Notes: - `input_motifs` parameter in the `config.txt` file is not diff --git a/tools/cellranger-atac-aggr.cwl b/tools/cellranger-atac-aggr.cwl index 4a3c275e..62858727 100644 --- a/tools/cellranger-atac-aggr.cwl +++ b/tools/cellranger-atac-aggr.cwl @@ -36,24 +36,29 @@ inputs: secondaryFiles: - .tbi doc: | - Array of files containing count and barcode information for - every ATAC fragment observed in the "cellranger-atac count" - experiment in TSV format. + Array of files containing count and + barcode information for every ATAC + fragment observed in the + "cellranger-atac count" experiment + in TSV format.
barcode_metrics_report: type: File[] doc: | - Array of files with per-barcode fragment counts & metrics - produced by "cellranger-atac count" command in CSV format + Array of files with per-barcode + fragment counts & metrics produced + by "cellranger-atac count" command + in CSV format. gem_well_labels: type: - "null" - string[] doc: | - Array of GEM well identifiers to be used for labeling purposes only. - If not provided use rootnames of files from the barcode_metrics_report - input + Array of GEM well identifiers to be + used for labeling purposes only. If + not provided use rootnames of files + from the barcode_metrics_report input. indices_folder: type: Directory @@ -61,9 +66,10 @@ inputs: position: 5 prefix: "--reference" doc: | - Path to folder containing a Cell Ranger ATAC or Cell Ranger - ARC reference. Should be generated by "cellranger-atac mkref" - or "cellranger-arc mkref" commands + Path to folder containing a Cell Ranger + ATAC or Cell Ranger ARC reference. Should + be generated by "cellranger-atac mkref" + or "cellranger-arc mkref" commands. normalization_mode: type: @@ -75,7 +81,8 @@ inputs: position: 6 prefix: "--normalize" doc: | - Library depth normalization mode: depth, none. + Library depth normalization + mode: depth, none. Default: depth threads: @@ -84,7 +91,8 @@ inputs: position: 7 prefix: "--localcores" doc: | - Set max cores the pipeline may request at one time. + Set max cores the pipeline may + request at one time. Default: all available memory_limit: @@ -93,7 +101,8 @@ inputs: position: 8 prefix: "--localmem" doc: | - Set max GB the pipeline may request at one time + Set max GB the pipeline may + request at one time. Default: all available virt_memory_limit: @@ -102,7 +111,8 @@ inputs: position: 9 prefix: "--localvmem" doc: | - Set max virtual address space in GB for the pipeline + Set max virtual address space + in GB for the pipeline. 
Default: all available @@ -113,28 +123,32 @@ outputs: outputBinding: glob: "aggregated/outs/web_summary.html" doc: | - Run summary metrics and charts in HTML format + Run summary metrics and charts + in HTML format. metrics_summary_report_json: type: File outputBinding: glob: "aggregated/outs/summary.json" doc: | - Run summary metrics in JSON format + Run summary metrics + in JSON format. metrics_summary_report_csv: type: File outputBinding: glob: "aggregated/outs/summary.csv" doc: | - Run summary metrics in CSV format + Run summary metrics + in CSV format. barcode_metrics_report: type: File outputBinding: glob: "aggregated/outs/singlecell.csv" doc: | - Per-barcode fragment counts & metrics in CSV format + Per-barcode fragment counts & + metrics in CSV format. fragments_file: type: File @@ -143,77 +157,86 @@ outputs: secondaryFiles: - .tbi doc: | - Count and barcode information for every ATAC fragment observed - in the aggregated experiment in TSV format + Count and barcode information for + every ATAC fragment observed in + the aggregated experiment in + TSV format. peaks_bed_file: type: File outputBinding: glob: "aggregated/outs/peaks.bed" doc: | - Locations of open-chromatin regions identified in the - aggregated experiment (these regions are referred to - as "peaks") + Locations of open-chromatin regions + identified in the aggregated experiment + (these regions are referred to as "peaks"). peak_annotation_file: type: File outputBinding: glob: "aggregated/outs/peak_annotation.tsv" doc: | - Annotations of peaks based on genomic proximity alone + Annotations of peaks based + on genomic proximity alone. secondary_analysis_report_folder: type: Directory outputBinding: glob: "aggregated/outs/analysis" doc: | - Folder with secondary analysis results + Folder with secondary + analysis results. 
filtered_feature_bc_matrix_folder: type: Directory outputBinding: glob: "aggregated/outs/filtered_peak_bc_matrix" doc: | - Folder with aggregated filtered peak-barcode matrices - containing only cellular barcodes in MEX format. + Folder with aggregated filtered + peak-barcode matrices containing + only cellular barcodes in MEX format. filtered_feature_bc_matrix_h5: type: File outputBinding: glob: "aggregated/outs/filtered_peak_bc_matrix.h5" doc: | - Aggregated filtered peak-barcode matrices containing - only cellular barcodes in HDF5 format. + Aggregated filtered peak-barcode + matrices containing only cellular + barcodes in HDF5 format. filtered_tf_bc_matrix_folder: type: Directory? outputBinding: glob: "aggregated/outs/filtered_tf_bc_matrix" doc: | - Folder with aggregated filtered tf-barcode matrices - containing only cellular barcodes in MEX format. + Folder with aggregated filtered + tf-barcode matrices containing only + cellular barcodes in MEX format. filtered_tf_bc_matrix_h5: type: File? outputBinding: glob: "aggregated/outs/filtered_tf_bc_matrix.h5" doc: | - Aggregated filtered tf-barcode matrices containing - only cellular barcodes in HDF5 format. + Aggregated filtered tf-barcode matrices + containing only cellular barcodes + in HDF5 format. aggregation_metadata: type: File outputBinding: glob: "aggregated/outs/aggregation_csv.csv" doc: | - Aggregation CSV file + Aggregation CSV file. loupe_browser_track: type: File outputBinding: glob: "aggregated/outs/cloupe.cloupe" doc: | - Loupe Browser visualization and analysis file + Loupe Browser visualization + and analysis file. 
stdout_log: type: stdout @@ -235,9 +258,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cellranger ATAC Aggregate" -s:name: "Cellranger ATAC Aggregate" -s:alternateName: "Aggregates outputs from multiple runs of Cell Ranger Count Chromatin Accessibility experiments" +label: "Cell Ranger Aggregate (ATAC)" +s:name: "Cell Ranger Aggregate (ATAC)" +s:alternateName: "Combines outputs from multiple runs of Cell Ranger Count (ATAC) pipeline" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-atac-aggr.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -275,10 +298,10 @@ s:creator: doc: | - Cellranger ATAC Aggregate + Cell Ranger Aggregate (ATAC) - Aggregates outputs from multiple runs of Cell Ranger Count Chromatin - Accessibility experiments + Combines outputs from multiple runs of + “Cell Ranger Count (ATAC)” pipeline. Parameters set by default: --disable-ui - no need in any UI when running in Docker container diff --git a/tools/cellranger-atac-count.cwl b/tools/cellranger-atac-count.cwl index bcddf7af..a5dfe03e 100644 --- a/tools/cellranger-atac-count.cwl +++ b/tools/cellranger-atac-count.cwl @@ -47,22 +47,30 @@ inputs: fastq_file_r1: type: File doc: | - FASTQ read 1 file (will be staged into workdir as sample_S1_L001_R1_001.fastq) + FASTQ read 1 file. + It will be staged into workdir + as sample_S1_L001_R1_001.fastq. fastq_file_r2: type: File doc: | - FASTQ read 2 file (will be staged into workdir as sample_S1_L001_R2_001.fastq) + FASTQ read 2 file. + It will be staged into workdir + as sample_S1_L001_R2_001.fastq. fastq_file_r3: type: File doc: | - FASTQ read 3 file (will be staged into workdir as sample_S1_L001_R3_001.fastq) + FASTQ read 3 file. + It will be staged into workdir + as sample_S1_L001_R3_001.fastq. fastq_file_i1: type: File? 
doc: | - FASTQ index file (if provided, will be staged into workdir as sample_S1_L001_I1_001.fastq) + FASTQ index file. + If provided, it will be staged into + workdir as sample_S1_L001_I1_001.fastq. indices_folder: type: Directory @@ -70,9 +78,11 @@ inputs: position: 10 prefix: "--reference" doc: | - Path to folder containing a Cell Ranger ATAC or Cell Ranger - ARC reference. Should be generated by "cellranger-atac mkref" - or "cellranger-arc mkref" commands + Path to folder containing a Cell + Ranger ATAC or Cell Ranger ARC + reference. Should be generated by + "cellranger-atac mkref" or + "cellranger-arc mkref" commands. force_cells: type: int? @@ -80,9 +90,11 @@ inputs: position: 11 prefix: "--force-cells" doc: | - Define the top N barcodes with the most fragments overlapping - peaks as cells. N must be a positive integer <= 20,000. Please - consult the documentation before using this option + Define the top N barcodes with the + most fragments overlapping peaks as + cells. N must be a positive integer + <= 20,000. Please consult the + documentation before using this option. chemistry: type: string? @@ -90,10 +102,11 @@ inputs: position: 12 prefix: "--chemistry" doc: | - Assay configuration. NOTE: by default the assay - configuration is detected automatically. Use - "ARC-v1" to indicate that it is a library from - the multiome assay. + Assay configuration. NOTE: by default + the assay configuration is detected + automatically. Use "ARC-v1" to indicate + that it is a library from the multiome + assay. threads: type: int? @@ -101,7 +114,8 @@ inputs: position: 13 prefix: "--localcores" doc: | - Set max cores the pipeline may request at one time. + Set max cores the pipeline + may request at one time. Default: all available memory_limit: @@ -110,7 +124,8 @@ inputs: position: 14 prefix: "--localmem" doc: | - Set max GB the pipeline may request at one time + Set max GB the pipeline + may request at one time.
Default: all available virt_memory_limit: @@ -119,7 +134,8 @@ inputs: position: 15 prefix: "--localvmem" doc: | - Set max virtual address space in GB for the pipeline + Set max virtual address space + in GB for the pipeline. Default: all available @@ -130,28 +146,32 @@ outputs: outputBinding: glob: "sample/outs/web_summary.html" doc: | - Run summary metrics and charts in HTML format + Run summary metrics and charts + in HTML format. metrics_summary_report_json: type: File outputBinding: glob: "sample/outs/summary.json" doc: | - Run summary metrics in JSON format + Run summary metrics + in JSON format. metrics_summary_report_csv: type: File outputBinding: glob: "sample/outs/summary.csv" doc: | - Run summary metrics in CSV format + Run summary metrics + in CSV format. barcode_metrics_report: type: File outputBinding: glob: "sample/outs/singlecell.csv" doc: | - Per-barcode fragment counts & metrics in CSV format + Per-barcode fragment counts & + metrics in CSV format. possorted_genome_bam_bai: type: File? @@ -160,8 +180,10 @@ outputs: secondaryFiles: - .bai doc: | - Indexed position-sorted reads aligned to the genome annotated - with barcode information in BAM format + Indexed position-sorted reads + aligned to the genome annotated + with barcode information in + BAM format. fragments_file: type: File @@ -170,93 +192,110 @@ outputs: secondaryFiles: - .tbi doc: | - Count and barcode information for every ATAC fragment observed - in the experiment in TSV format + Count and barcode information for + every ATAC fragment observed + in the experiment in TSV format. peaks_bed_file: type: File outputBinding: glob: "sample/outs/peaks.bed" doc: | - Locations of open-chromatin regions identified in the - experiment (these regions are referred to as "peaks") + Locations of open-chromatin regions + identified in the experiment (these + regions are referred to as "peaks"). 
peak_annotation_file: type: File outputBinding: glob: "sample/outs/peak_annotation.tsv" doc: | - Annotations of peaks based on genomic proximity alone + Annotations of peaks based on + genomic proximity alone. cut_sites_bigwig_file: type: File outputBinding: glob: "sample/outs/cut_sites.bigwig" doc: | - Smoothed transposition site track in bigWig format + Smoothed transposition site track + in bigWig format. peak_motif_mapping_bed: type: File? outputBinding: glob: "sample/outs/peak_motif_mapping.bed" doc: | - File with peak-motif associations in BED format + File with peak-motif associations + in BED format. filtered_feature_bc_matrix_folder: type: Directory outputBinding: glob: "sample/outs/filtered_peak_bc_matrix" doc: | - Folder with filtered peak-barcode matrices containing only cellular barcodes in MEX format. + Folder with filtered peak-barcode + matrices containing only cellular + barcodes in MEX format. filtered_feature_bc_matrix_h5: type: File outputBinding: glob: "sample/outs/filtered_peak_bc_matrix.h5" doc: | - Filtered peak-barcode matrices containing only cellular barcodes in HDF5 format. + Filtered peak-barcode matrices + containing only cellular barcodes + in HDF5 format. filtered_tf_bc_matrix_folder: type: Directory? outputBinding: glob: "sample/outs/filtered_tf_bc_matrix" doc: | - Folder with filtered tf-barcode matrices containing only cellular barcodes in MEX format. + Folder with filtered tf-barcode + matrices containing only cellular + barcodes in MEX format. filtered_tf_bc_matrix_h5: type: File? outputBinding: glob: "sample/outs/filtered_tf_bc_matrix.h5" doc: | - Filtered tf-barcode matrices containing only cellular barcodes in HDF5 format. + Filtered tf-barcode matrices containing + only cellular barcodes in HDF5 format. 
raw_feature_bc_matrices_folder: type: Directory outputBinding: glob: "sample/outs/raw_peak_bc_matrix" doc: | - Folder with unfiltered peak-barcode matrices containing all barcodes in MEX format + Folder with unfiltered peak-barcode + matrices containing all barcodes + in MEX format. raw_feature_bc_matrices_h5: type: File outputBinding: glob: "sample/outs/raw_peak_bc_matrix.h5" doc: | - Unfiltered peak-barcode matrices containing all barcodes in HDF5 format + Unfiltered peak-barcode matrices + containing all barcodes in HDF5 format. secondary_analysis_report_folder: type: Directory outputBinding: glob: "sample/outs/analysis" doc: | - Folder with secondary analysis results + Folder with secondary + analysis results. loupe_browser_track: type: File outputBinding: glob: "sample/outs/cloupe.cloupe" doc: | - Loupe Browser visualization and analysis file + Loupe Browser visualization + and analysis file. stdout_log: type: stdout @@ -320,7 +359,8 @@ s:creator: doc: | Cell Ranger Count (ATAC) - Counts reads from a single scATAC-Seq library. + Quantifies single-cell chromatin accessibility of the + sequencing data from a single 10x Genomics library. Parameters set by default: --disable-ui - no need in any UI when running in Docker container diff --git a/tools/cellranger-count.cwl b/tools/cellranger-count.cwl index 117ae521..14bc3b2e 100644 --- a/tools/cellranger-count.cwl +++ b/tools/cellranger-count.cwl @@ -34,7 +34,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: cumulusprod/cellranger:7.0.0 + dockerPull: cumulusprod/cellranger:8.0.1 inputs: @@ -42,17 +42,24 @@ inputs: fastq_file_r1: type: File doc: | - FASTQ read 1 file (will be staged into workdir as sample_S1_L001_R1_001.fastq) + FASTQ read 1 file. + It will be staged into workdir + as sample_S1_L001_R1_001.fastq. fastq_file_r2: type: File doc: | - FASTQ read 2 file (will be staged into workdir as sample_S1_L001_R2_001.fastq) + FASTQ read 2 file. 
+ It will be staged into workdir + as sample_S1_L001_R2_001.fastq. fastq_file_i1: type: File? doc: | - FASTQ index file (if provided, will be staged into workdir as sample_S1_L001_I1_001.fastq) + FASTQ index file. + If provided, it will be staged + into workdir as + sample_S1_L001_I1_001.fastq. indices_folder: type: Directory @@ -60,8 +67,10 @@ inputs: position: 5 prefix: "--transcriptome" doc: | - Path of folder containing 10x-compatible transcriptome reference. - Should be generated by "cellranger mkref" command + Path of folder containing 10x-compatible + transcriptome reference. These indices + should be generated by "cellranger mkref" + command. r1_length: type: int? @@ -69,11 +78,14 @@ inputs: position: 6 prefix: "--r1-length" doc: | - Limit the length of the input Read 1 sequence of Gene Expression library - to the first N bases, where N is a user-supplied value. Note that the length - includes the 10x Barcode and UMI sequences so do not set this below 26 for - Single Cell 3′ v2 or Single Cell 5′. This and --r2-length are useful options - for determining the optimal read length for sequencing. + Limit the length of the input Read 1 sequence + of Gene Expression library to the first N bases, + where N is a user-supplied value. Note that the + length includes the 10x Barcode and UMI sequences + so do not set this below 26 for Single Cell 3′ v2 + or Single Cell 5′. This and --r2-length are useful + options for determining the optimal read length + for sequencing. r2_length: type: int? @@ -81,9 +93,11 @@ inputs: position: 7 prefix: "--r2-length" doc: | - Limit the length of the input R2 sequence to the first N bases, where N is a - user-supplied value. Trimming occurs before sequencing metrics are computed - and therefore, limiting R2 read length may affect Q30 scores. + Limit the length of the input R2 sequence to the + first N bases, where N is a user-supplied value. 
+ Trimming occurs before sequencing metrics are + computed and therefore, limiting R2 read length + may affect Q30 scores. expect_cells: type: int? @@ -92,9 +106,10 @@ inputs: prefix: "--expect-cells" doc: | Expected number of recovered cells. - Starting in Cell Ranger 7.0, the expected number of cells can be either auto-estimated - or specified with --expect-cells. To replicate an old cellranger count analysis, set - this parameter to 3,000 cells. + Starting in Cell Ranger 7.0, the expected number + of cells can be either auto-estimated or specified + with --expect-cells. To replicate an old cellranger + count analysis, set this parameter to 3,000 cells. force_cells: type: int? @@ -102,30 +117,42 @@ inputs: position: 9 prefix: "--force-cells" doc: | - Force pipeline to use this number of cells, bypassing the cell detection algorithm. - Use this if the number of cells estimated by Cell Ranger is not consistent with the - barcode rank plot. + Force pipeline to use this number of cells, bypassing + the cell detection algorithm. Use this if the number + of cells estimated by Cell Ranger is not consistent + with the barcode rank plot. no_bam: type: boolean? + default: false inputBinding: + prefix: "--create-bam=" + valueFrom: $(self?"false":"true") + separate: false position: 10 - prefix: "--no-bam" doc: | - Set this flag to not generate the BAM file. This will reduce the total computation - time for the pipestance and the size of the output directory. If unsure, we recommend - not to use this option. BAM file could be useful for troubleshooting and downstream - analysis + Set to true to skip BAM file generation (the + tool is then run with --create-bam=false). This + reduces the total computation time and the size + of the output directory. We recommend keeping + the default false (--create-bam=true) if unsure. + See https://10xgen.com/create-bam for additional + guidance [possible values: true, false] exclude_introns: type: boolean?
+ default: false inputBinding: + prefix: "--include-introns=" + valueFrom: $(self?"false":"true") + separate: false position: 11 - prefix: "--include-introns=false" doc: | - In Cell Ranger v7.0 intronic reads are counted by default for whole transcriptome - gene expression data, except when --target-panel is used. Therefore, here we provide - a flag to disable this default behavior. + Starting from the Cell Ranger v7.0 the intronic reads + are counted by default for whole transcriptome gene + expression data, except when --target-panel is used. + Therefore, here we provide a flag to disable this + default behavior. threads: type: int? @@ -162,14 +189,14 @@ outputs: outputBinding: glob: "sample/outs/web_summary.html" doc: | - Run summary metrics and charts in HTML format + Run summary metrics and charts in HTML format. metrics_summary_report: type: File outputBinding: glob: "sample/outs/metrics_summary.csv" doc: | - Run summary metrics in CSV format + Run summary metrics in CSV format. possorted_genome_bam_bai: type: File? @@ -178,58 +205,66 @@ outputs: secondaryFiles: - .bai doc: | - Indexed reads aligned to the genome and transcriptome annotated with barcode information + Indexed reads aligned to the genome + and transcriptome annotated with + barcode information. filtered_feature_bc_matrix_folder: type: Directory outputBinding: glob: "sample/outs/filtered_feature_bc_matrix" doc: | - Folder with filtered feature-barcode matrices containing only cellular barcodes in MEX format. + Folder with filtered feature-barcode matrices + containing only cellular barcodes in MEX format. filtered_feature_bc_matrix_h5: type: File outputBinding: glob: "sample/outs/filtered_feature_bc_matrix.h5" doc: | - Filtered feature-barcode matrices containing only cellular barcodes in HDF5 format. + Filtered feature-barcode matrices containing + only cellular barcodes in HDF5 format. 
raw_feature_bc_matrices_folder: type: Directory outputBinding: glob: "sample/outs/raw_feature_bc_matrix" doc: | - Folder with unfiltered feature-barcode matrices containing all barcodes in MEX format + Folder with unfiltered feature-barcode matrices + containing all barcodes in MEX format. raw_feature_bc_matrices_h5: type: File outputBinding: glob: "sample/outs/raw_feature_bc_matrix.h5" doc: | - Unfiltered feature-barcode matrices containing all barcodes in HDF5 format + Unfiltered feature-barcode matrices containing + all barcodes in HDF5 format. secondary_analysis_report_folder: type: Directory outputBinding: glob: "sample/outs/analysis" doc: | - Folder with secondary analysis results including dimensionality reduction, - cell clustering, and differential expression + Folder with secondary analysis results + including dimensionality reduction, cell + clustering, and differential expression. molecule_info_h5: type: File outputBinding: glob: "sample/outs/molecule_info.h5" doc: | - Molecule-level information used by cellranger aggr to aggregate samples into - larger datasets + Molecule-level information used by cellranger + aggr to aggregate samples into larger datasets. loupe_browser_track: type: File outputBinding: glob: "sample/outs/cloupe.cloupe" doc: | - Loupe Browser visualization and analysis file + Loupe Browser visualization and + analysis file. 
stdout_log: type: stdout @@ -251,9 +286,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cell Ranger Count Gene Expression" -s:name: "Cell Ranger Count Gene Expression" -s:alternateName: "Quantifies gene expression from a single scRNA-Seq library" +s:name: "Cell Ranger Count (RNA)" +label: "Cell Ranger Count (RNA)" +s:alternateName: "Quantifies single-cell gene expression of the sequencing data from a single 10x Genomics library" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-count.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -291,12 +326,14 @@ s:creator: doc: | - Cell Ranger Count Gene Expression + Cell Ranger Count (RNA) - Quantifies gene expression from a single-cell RNA-Seq library. + Quantifies single-cell gene expression of the sequencing + data from a single 10x Genomics library. - New in Cell Ranger v7.0: Intronic reads are counted by default for - whole transcriptome gene expression data. For more details see + Starting from the Cell Ranger v7.0 the intronic reads are + counted by default for whole transcriptome gene expression + data. 
For more details see https://support.10xgenomics.com/docs/intron-mode-rec Input parameters for Feature Barcode, Targeted Gene Expression @@ -322,6 +359,7 @@ doc: | --chemistry - cell ranger will autodetect the library by default --no-libraries - used only in Feature Barcode analysis --check-library-compatibility - no reason to disable it + --min-crispr-umi - needed only for Protospacer calling (for pooled CRISPR screens) --no-target-umi-filter - needed only for Targeted Gene Expression analysis --dry - not applicable to our use case --jobmode - we use default local mode @@ -332,58 +370,57 @@ doc: | --uiport - we disabled UI --noexit - we disabled UI --nopreflight - no reason to skip preflight checks + --output-dir - will be saved to the sample/outs by default s:about: | - Count gene expression (targeted or whole-transcriptome) and/or feature barcode reads - from a single sample and GEM well - - USAGE: - cellranger count [OPTIONS] --id --transcriptome - - OPTIONS: - --id A unique run id and output folder name [a-zA-Z0-9_-]+ - --description Sample description to embed in output files [default: ] - --transcriptome Path of folder containing 10x-compatible transcriptome reference - --fastqs Path to input FASTQ data - --project Name of the project folder within a mkfastq or bcl2fastq-generated folder from which to pick FASTQs - --sample Prefix of the filenames of FASTQs to select - --lanes Only use FASTQs from selected lanes - --libraries CSV file declaring input library data sources - --feature-ref Feature reference CSV file, declaring Feature Barcode constructs and associated barcodes - --target-panel The target panel CSV file declaring the target panel used, if any. Default analysis will exclude intronic mapped reads, which - is the recommended mode for targeted assay. 
Use include-introns=true to include intronic mapped reads in analysis - --expect-cells Expected number of recovered cells, used as input to cell calling algorithm - --force-cells Force pipeline to use this number of cells, bypassing cell calling algorithm. [MINIMUM: 10] - --no-bam Set --no-bam to not generate the BAM file. This will reduce the total computation time for the pipestance and the size of the - output directory. If unsure, we recommend not to use this option. BAM file could be useful for troubleshooting and downstream - analysis - --nosecondary Disable secondary analysis, e.g. clustering. Optional - --r1-length Hard trim the input Read 1 to this length before analysis - --r2-length Hard trim the input Read 2 to this length before analysis - --include-introns Include intronic reads in count (default=true unless --target-panel is specified in which case default=false) - --chemistry Assay configuration. NOTE: by default the assay configuration is detected automatically, which is the recommened mode. You - usually will not need to specify a chemistry. Options are: 'auto' for autodetection, 'threeprime' for Single Cell 3', - 'fiveprime' for Single Cell 5', 'SC3Pv1' or 'SC3Pv2' or 'SC3Pv3' for Single Cell 3' v1/v2/v3, 'SC3Pv3LT' for Single Cell 3' - v3 LT, 'SC3Pv3HT' for Single Cell 3' v3 HT, 'SC5P-PE' or 'SC5P-R2' for Single Cell 5', paired-end/R2-only, 'SC-FB' for Single - Cell Antibody-only 3' v2 or 5' [default: auto] - --no-libraries Proceed with processing using a --feature-ref but no Feature Barcode libraries specified with the 'libraries' flag - --check-library-compatibility Whether to check for barcode compatibility between libraries. [default: true] - --no-target-umi-filter Turn off the target UMI filtering subpipeline. Only applies when --target-panel is used - --dry Do not execute the pipeline. Generate a pipeline invocation (.mro) file and stop - --jobmode Job manager to use. 
Valid options: local (default), sge, lsf, slurm or path to a .template file. Search for help on "Cluster - Mode" at support.10xgenomics.com for more details on configuring the pipeline to use a compute cluster [default: local] - --localcores Set max cores the pipeline may request at one time. Only applies to local jobs - --localmem Set max GB the pipeline may request at one time. Only applies to local jobs - --localvmem Set max virtual address space in GB for the pipeline. Only applies to local jobs - --mempercore Reserve enough threads for each job to ensure enough memory will be available, assuming each core on your cluster has at least - this much memory available. Only applies to cluster jobmodes - --maxjobs Set max jobs submitted to cluster at one time. Only applies to cluster jobmodes - --jobinterval Set delay between submitting jobs to cluster, in ms. Only applies to cluster jobmodes - --overrides The path to a JSON file that specifies stage-level overrides for cores and memory. Finer-grained than --localcores, + Count gene expression and/or feature barcode reads from a single sample and GEM well + + Usage: cellranger count [OPTIONS] --id --transcriptome --create-bam + + Options: + --id A unique run id and output folder name [a-zA-Z0-9_-]+ + --description Sample description to embed in output files [default: ] + --transcriptome Path of folder containing 10x-compatible transcriptome reference + --fastqs Path to input FASTQ data + --project Name of the project folder within a mkfastq or bcl2fastq-generated folder from which to pick FASTQs + --sample Prefix of the filenames of FASTQs to select + --lanes Only use FASTQs from selected lanes + --libraries CSV file declaring input library data sources + --feature-ref Feature reference CSV file, declaring Feature Barcode constructs and associated barcodes + --expect-cells Expected number of recovered cells, used as input to cell calling algorithm + --force-cells Force pipeline to use this number of cells, bypassing 
cell calling algorithm. [MINIMUM: 10] + --create-bam Enable or disable BAM file generation. Setting --create-bam=false reduces the total computation time and the size of the + output directory (BAM file not generated). We recommend setting --create-bam=true if unsure. See + https://10xgen.com/create-bam for additional guidance [possible values: true, false] + --nosecondary Disable secondary analysis, e.g. clustering. Optional + --r1-length Hard trim the input Read 1 to this length before analysis + --r2-length Hard trim the input Read 2 to this length before analysis + --include-introns Include intronic reads in count [default: true] [possible values: true, false] + --chemistry Assay configuration. NOTE: by default the assay configuration is detected automatically, which is the recommended mode. + You usually will not need to specify a chemistry. Options are: 'auto' for autodetection, 'threeprime' for Single Cell + 3', 'fiveprime' for Single Cell 5', 'SC3Pv1' or 'SC3Pv2' or 'SC3Pv3' or 'SC3Pv4' for Single Cell 3' v1/v2/v3/v4, + 'SC3Pv3LT' for Single Cell 3' v3 LT, 'SC3Pv3HT' for Single Cell 3' v3 HT, 'SC5P-PE' or 'SC5P-PE-v3' or 'SC5P-R2' or + 'SC5P-R2-v3', for Single Cell 5', paired-end/R2-only, 'SC-FB' for Single Cell Antibody-only 3' v2 or 5'. To analyze the + GEX portion of multiome data, chemistry must be set to 'ARC-v1' [default: auto] + --no-libraries Proceed with processing using a --feature-ref but no Feature Barcode libraries specified with the 'libraries' flag + --check-library-compatibility Whether to check for barcode compatibility between libraries. [default: true] [possible values: true, false] + --min-crispr-umi Minimum CRISPR UMI threshold [default: 3] + --dry Do not execute the pipeline. Generate a pipeline invocation (.mro) file and stop + --jobmode Job manager to use. Valid options: local (default), sge, lsf, slurm or path to a .template file. 
Search for help on + "Cluster Mode" at support.10xgenomics.com for more details on configuring the pipeline to use a compute cluster + --localcores Set max cores the pipeline may request at one time. Only applies to local jobs + --localmem Set max GB the pipeline may request at one time. Only applies to local jobs + --localvmem Set max virtual address space in GB for the pipeline. Only applies to local jobs + --mempercore Reserve enough threads for each job to ensure enough memory will be available, assuming each core on your cluster has at + least this much memory available. Only applies to cluster jobmodes + --maxjobs Set max jobs submitted to cluster at one time. Only applies to cluster jobmodes + --jobinterval Set delay between submitting jobs to cluster, in ms. Only applies to cluster jobmodes + --overrides The path to a JSON file that specifies stage-level overrides for cores and memory. Finer-grained than --localcores, --mempercore and --localmem. Consult https://support.10xgenomics.com/ for an example override file - --uiport Serve web UI at http://localhost:PORT - --disable-ui Do not serve the web UI - --noexit Keep web UI running after pipestance completes or fails - --nopreflight Skip preflight checks - -h, --help Print help information \ No newline at end of file + --output-dir Output the results to this directory + --uiport Serve web UI at http://localhost:PORT + --disable-ui Do not serve the web UI + --noexit Keep web UI running after pipestance completes or fails + --nopreflight Skip preflight checks + -h, --help Print help \ No newline at end of file diff --git a/tools/cellranger-mkref.cwl b/tools/cellranger-mkref.cwl index 6ded927b..c44f5c72 100644 --- a/tools/cellranger-mkref.cwl +++ b/tools/cellranger-mkref.cwl @@ -11,12 +11,12 @@ requirements: return (root == "")?inputs.genome_fasta_file.basename:root; } else { return inputs.output_folder_name; - } + } }; hints: - class: DockerRequirement - dockerPull: cumulusprod/cellranger:7.0.0 + dockerPull: 
cumulusprod/cellranger:8.0.1 inputs: @@ -50,11 +50,14 @@ inputs: threads: type: int? inputBinding: + valueFrom: $(["--nthreads", self, "--localcores", self]) position: 8 - prefix: "--nthreads" doc: | - Number of threads used during STAR genome index - Default: 1 + Number of threads used during STAR + genome index. And the max cores the + pipeline may request at one time. + Default: 1 for --nthreads and all + available for --localcores memory_limit: type: int? @@ -103,9 +106,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cell Ranger Build Reference Indices" -s:name: "Cell Ranger Build Reference Indices" -s:alternateName: "Builds Cell Ranger compatible reference folder from the custom genome FASTA and gene GTF annotation files" +label: "Cell Ranger Reference (RNA)" +s:name: "Cell Ranger Reference (RNA)" +s:alternateName: "Builds a reference genome of a selected species for quantifying gene expression" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-mkref.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -143,44 +146,58 @@ s:creator: doc: | - Cell Ranger Build Reference Indices + Cell Ranger Reference (RNA) - Builds Cell Ranger compatible reference folder from - the custom genome FASTA and gene GTF annotation files. + Builds a reference genome of a selected species + for quantifying gene expression. + + Both --nthreads and --localcores parameters are + configured through "threads" input. s:about: | - Build a Cell Ranger-compatible reference folder from user-supplied genome - FASTA and gene GTF files. Creates a new folder named after the genome. - - The commands below should be preceded by 'cellranger': - - Usage: - mkref - --genome=NAME ... - --fasta=PATH ... - --genes=PATH ... - [options] - mkref -h | --help | --version - - Arguments: - genome Unique genome name(s), used to name output folder - [a-zA-Z0-9_-]+. 
Specify multiple genomes by - specifying the --genome argument multiple times; the - output folder will be _and_. - fasta Path(s) to FASTA file containing your genome reference. - Specify multiple genomes by specifying the --fasta - argument multiple times. - genes Path(s) to genes GTF file(S) containing annotated genes - for your genome reference. Specify multiple genomes - by specifying the --genes argument multiple times. + Prepare a reference for use with 10x analysis software. Requires a GTF and FASTA + + Usage: cellranger mkref [OPTIONS] --genome --fasta --genes Options: - --nthreads= Number of threads used during STAR genome index - generation. Defaults to 1. - --memgb= Maximum memory (GB) used when aligning reads with STAR. - Defaults to 16. - --ref-version= Optional reference version string to include with - reference. - -h --help Show this message. - --version Show version. \ No newline at end of file + --genome Unique genome name, used to name output folder [a-zA-Z0-9_-]+. Specify + multiple genomes by specifying this argument multiple times; the output + folder will be _and_ + --fasta Path to FASTA file containing your genome reference. Specify multiple + genomes by specifying this argument multiple times + --genes Path to genes GTF file containing annotated genes for your genome + reference. Specify multiple genomes by specifying this argument multiple + times + --nthreads Number of threads used during STAR genome index generation. Defaults to 1 + [default: 1] + --memgb Maximum memory (GB) used [default: 16] + --ref-version Optional reference version string to include with reference + --dry Do not execute the pipeline. Generate a pipeline invocation (.mro) file + and stop + --jobmode Job manager to use. Valid options: local (default), sge, lsf, slurm or + path to a .template file. 
Search for help on "Cluster Mode" at + support.10xgenomics.com for more details on configuring the pipeline to + use a compute cluster + --localcores Set max cores the pipeline may request at one time. Only applies to local + jobs + --localmem Set max GB the pipeline may request at one time. Only applies to local + jobs + --localvmem Set max virtual address space in GB for the pipeline. Only applies to + local jobs + --mempercore Reserve enough threads for each job to ensure enough memory will be + available, assuming each core on your cluster has at least this much + memory available. Only applies to cluster jobmodes + --maxjobs Set max jobs submitted to cluster at one time. Only applies to cluster + jobmodes + --jobinterval Set delay between submitting jobs to cluster, in ms. Only applies to + cluster jobmodes + --overrides The path to a JSON file that specifies stage-level overrides for cores + and memory. Finer-grained than --localcores, --mempercore and --localmem. + Consult https://support.10xgenomics.com/ for an example override file + --output-dir Output the results to this directory + --uiport Serve web UI at http://localhost:PORT + --disable-ui Do not serve the web UI + --noexit Keep web UI running after pipestance completes or fails + --nopreflight Skip preflight checks + -h, --help Print help \ No newline at end of file diff --git a/tools/cellranger-mkvdjref.cwl b/tools/cellranger-mkvdjref.cwl index e8a80c71..f2269982 100644 --- a/tools/cellranger-mkvdjref.cwl +++ b/tools/cellranger-mkvdjref.cwl @@ -11,12 +11,12 @@ requirements: return (root == "")?inputs.genome_fasta_file.basename:root; } else { return inputs.output_folder_name; - } + } }; hints: - class: DockerRequirement - dockerPull: cumulusprod/cellranger:7.0.0 + dockerPull: cumulusprod/cellranger:8.0.1 inputs: @@ -27,7 +27,8 @@ inputs: position: 5 prefix: "--fasta" doc: | - Genome FASTA file. Hard/soft-masked files are not allowed. + Genome FASTA file. Hard/soft-masked + files are not allowed. 
annotation_gtf_file: type: File @@ -35,7 +36,8 @@ inputs: position: 6 prefix: "--genes" doc: | - GTF annotation file. Should include gene_biotype/transcript_biotype fields + GTF annotation file. Should include + gene_biotype/transcript_biotype fields output_folder_name: type: string? @@ -45,7 +47,26 @@ inputs: valueFrom: $(get_output_folder_name()) default: "" doc: | - Unique genome name, used to name output folder + Unique genome name, used + to name the output folder + + threads: + type: int? + inputBinding: + position: 8 + prefix: "--localcores" + doc: | + Set max cores the pipeline may request at one time. + Default: all available + + memory_limit: + type: int? + inputBinding: + position: 9 + prefix: "--memgb" + doc: | + Maximum memory (GB) used. + Default: 16 outputs: @@ -55,8 +76,9 @@ outputs: outputBinding: glob: $(get_output_folder_name()) doc: | - Cell Ranger V(D)J-compatible reference folder. - This folder will include V(D)J segment FASTA file. + Cell Ranger V(D)J-compatible reference + folder. This folder will include V(D)J + segment FASTA file.
stdout_log: type: stdout @@ -78,9 +100,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cell Ranger Build V(D)J Reference Indices" -s:name: "Cell Ranger Build V(D)J Reference Indices" -s:alternateName: "Build a Cell Ranger V(D)J-compatible reference folder from a user-supplied genome FASTA and gene GTF files" +label: "Cell Ranger Reference (VDJ)" +s:name: "Cell Ranger Reference (VDJ)" +s:alternateName: "Builds a reference genome of a selected species for V(D)J contigs assembly and clonotype calling" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-mkvdjref.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -118,59 +140,100 @@ s:creator: doc: | - Cell Ranger Build V(D)J Reference Indices - - Build a Cell Ranger V(D)J-compatible reference folder from: - 1) A user-supplied genome FASTA and gene GTF files. - For example, using files from ENSEMBL. - 2) A FASTA file containing V(D)J segments as per the mkvdjref spec. - For example, using files from IMGT. + Cell Ranger Reference (VDJ) + + Builds a reference genome of a selected species for V(D)J + contigs assembly and clonotype calling. - For simplicity purpose only option 1) is supported - user need to - provide GTF annotation file, input --seqs is not implemented. + Input --seqs is not implemented. Chromosome names in GTF file should correspond to the chromosome names in FASTA file. s:about: | - Reference preparation tool for 10x Genomics Cell Ranger V(D)J assembler. + Prepare a reference for use with CellRanger VDJ. - Build a Cell Ranger V(D)J-compatible reference folder from: - 1) A user-supplied genome FASTA and gene GTF files. - For example, using files from ENSEMBL. - OR - 2) A FASTA file containing V(D)J segments as per the mkvdjref spec. - For example, using files from IMGT. 
+ Build a Cell Ranger V(D)J-compatible reference folder from: 1) A user-supplied genome FASTA and gene GTF files. For + example, using files from ENSEMBL. OR 2) A FASTA file containing V(D)J segments as per the mkvdjref spec. For example, + using files from IMGT. Creates a new folder named after the genome. - The commands below should be preceded by 'cellranger': - - Usage: - mkvdjref --genome=NAME --fasta=PATH --genes=PATH ...[options] - mkvdjref --genome=NAME --seqs=PATH [options] - mkvdjref -h | --help | --version - - Arguments: - genome A unique genome name, used to name output folder - [a-zA-Z0-9_-]+. - fasta Path to FASTA file containing your genome reference. - genes One or more GTF files containing annotated genes for - your genome reference. Specify multiple files by - specifying the --genes argument multiple times. The - files will be concatenated. - seqs A FASTA file that directly specifies V(D)J sequences. - This is mutually exclusive with the the "fasta" and - "genes" args above. + Usage: cellranger mkvdjref [OPTIONS] --genome Options: - --ref-version= - Optional reference version string to include. - --rm-transcripts=PATH - Path to text file with transcript IDs to ignore. This - file should have one transcript ID per line where - the IDs correspond to the "transcript_id" key in the - GTF info column. - -h --help Show this message. - --version Show version. + --genome + Unique genome name, used to name output folder [a-zA-Z0-9_-]+ + + --fasta + Path to FASTA file containing your genome reference + + --genes + Path to genes GTF file containing annotated genes for your genome reference. Specify multiple genomes by + specifying this argument multiple times + + --seqs + Path to a FASTA file that directly specifies V(D)J sequences. This is mutually exclusive with the "fasta" and + "genes" args + + --rm-transcripts + Path to text file with transcript IDs to ignore. 
This file should have one transcript ID per line where the IDs + correspond to the "transcript_id" key in the GTF info column + + --memgb + Maximum memory (GB) used + + [default: 16] + + --ref-version + Optional reference version string to include with reference + + --dry + Do not execute the pipeline. Generate a pipeline invocation (.mro) file and stop + + --jobmode + Job manager to use. Valid options: local (default), sge, lsf, slurm or path to a .template file. Search for help + on "Cluster Mode" at support.10xgenomics.com for more details on configuring the pipeline to use a compute + cluster + + --localcores + Set max cores the pipeline may request at one time. Only applies to local jobs + + --localmem + Set max GB the pipeline may request at one time. Only applies to local jobs + + --localvmem + Set max virtual address space in GB for the pipeline. Only applies to local jobs + + --mempercore + Reserve enough threads for each job to ensure enough memory will be available, assuming each core on your + cluster has at least this much memory available. Only applies to cluster jobmodes + + --maxjobs + Set max jobs submitted to cluster at one time. Only applies to cluster jobmodes + + --jobinterval + Set delay between submitting jobs to cluster, in ms. Only applies to cluster jobmodes + + --overrides + The path to a JSON file that specifies stage-level overrides for cores and memory. Finer-grained than + --localcores, --mempercore and --localmem. 
Consult https://support.10xgenomics.com/ for an example override file + + --output-dir + Output the results to this directory + + --uiport + Serve web UI at http://localhost:PORT + + --disable-ui + Do not serve the web UI + + --noexit + Keep web UI running after pipestance completes or fails + + --nopreflight + Skip preflight checks + + -h, --help + Print help (see a summary with '-h') diff --git a/tools/cellranger-multi.cwl b/tools/cellranger-multi.cwl index 95efc03b..d66a44a3 100644 --- a/tools/cellranger-multi.cwl +++ b/tools/cellranger-multi.cwl @@ -18,13 +18,13 @@ requirements: ${ var listing = [ { - "entry": inputs.gex_fastq_file_r1, - "entryname": "gex_S1_L001_R1_001.fastq", + "entry": inputs.rna_fastq_file_r1, + "entryname": "rna_S1_L001_R1_001.fastq", "writable": true }, { - "entry": inputs.gex_fastq_file_r2, - "entryname": "gex_S1_L001_R2_001.fastq", + "entry": inputs.rna_fastq_file_r2, + "entryname": "rna_S1_L001_R2_001.fastq", "writable": true }, { @@ -39,30 +39,31 @@ requirements: }, { "entry":`[gene-expression] - reference,${inputs.gex_indices_folder.path} + reference,${inputs.rna_indices_folder.path} + create-bam,${inputs.no_bam?"false":"true"} [vdj] reference,${inputs.vdj_indices_folder.path} [libraries] fastq_id,fastqs,lanes,feature_types - gex,${runtime.outdir},1,gene expression, + rna,${runtime.outdir},1,gene expression, vdj,${runtime.outdir},1,${inputs.vdj_chain_type}`, "entryname": "libraries.csv" } ] - if (inputs.gex_fastq_file_i1){ + if (inputs.rna_fastq_file_i1){ listing.push( { - "entry": inputs.gex_fastq_file_i1, - "entryname": "gex_S1_L001_I1_001.fastq", + "entry": inputs.rna_fastq_file_i1, + "entryname": "rna_S1_L001_I1_001.fastq", "writable": true } ); }; - if (inputs.gex_fastq_file_i2){ + if (inputs.rna_fastq_file_i2){ listing.push( { - "entry": inputs.gex_fastq_file_i2, - "entryname": "gex_S1_L001_I2_001.fastq", + "entry": inputs.rna_fastq_file_i2, + "entryname": "rna_S1_L001_I2_001.fastq", "writable": true } ); @@ -91,62 +92,82 @@ 
requirements: hints: - class: DockerRequirement - dockerPull: cumulusprod/cellranger:7.0.0 + dockerPull: cumulusprod/cellranger:8.0.1 inputs: - - gex_fastq_file_r1: + + rna_fastq_file_r1: type: File doc: | - GEX FASTQ read 1 file (will be staged into workdir as gex_S1_L001_R1_001.fastq) + RNA FASTQ read 1 file. + It will be staged into workdir + as rna_S1_L001_R1_001.fastq. - gex_fastq_file_r2: + rna_fastq_file_r2: type: File doc: | - GEX FASTQ read 2 file (will be staged into workdir as gex_S1_L001_R2_001.fastq) + RNA FASTQ read 2 file. + It will be staged into workdir + as rna_S1_L001_R2_001.fastq. - gex_fastq_file_i1: + rna_fastq_file_i1: type: File? doc: | - GEX FASTQ index i7 file (will be staged into workdir as gex_S1_L001_I1_001.fastq) + RNA FASTQ index i7 file. + It will be staged into workdir + as rna_S1_L001_I1_001.fastq. - gex_fastq_file_i2: + rna_fastq_file_i2: type: File? doc: | - GEX FASTQ index i5 file (will be staged into workdir as gex_S1_L001_I2_001.fastq) + RNA FASTQ index i5 file. + It will be staged into workdir + as rna_S1_L001_I2_001.fastq. vdj_fastq_file_r1: type: File doc: | - V(D)J FASTQ read 1 file (will be staged into workdir as vdj_S1_L001_R1_001.fastq) + V(D)J FASTQ read 1 file. + It will be staged into workdir + as vdj_S1_L001_R1_001.fastq. vdj_fastq_file_r2: type: File doc: | - V(D)J FASTQ read 2 file (will be staged into workdir as vdj_S1_L001_R2_001.fastq) + V(D)J FASTQ read 2 file. + It will be staged into workdir + as vdj_S1_L001_R2_001.fastq. vdj_fastq_file_i1: type: File? doc: | - V(D)J FASTQ index i7 file (will be staged into workdir as vdj_S1_L001_I1_001.fastq) + V(D)J FASTQ index i7 file. + It will be staged into workdir + as vdj_S1_L001_I1_001.fastq. vdj_fastq_file_i2: type: File? doc: | - V(D)J FASTQ index i5 file (will be staged into workdir as vdj_S1_L001_I2_001.fastq) + V(D)J FASTQ index i5 file. + It will be staged into workdir + as vdj_S1_L001_I2_001.fastq. 
- gex_indices_folder: + rna_indices_folder: type: Directory doc: | - Path of folder containing 10x-compatible transcriptome reference. - Should be generated by "cellranger mkref" command + Path of folder containing 10x- + compatible transcriptome reference. + Should be generated by "cellranger + mkref" command. vdj_indices_folder: type: Directory doc: | - Path of folder containing Cell Ranger V(D)J-compatible reference. - Should be generated by "cellranger mkvdjref" command + Path of folder containing Cell Ranger + V(D)J-compatible reference. Should be + generated by "cellranger mkvdjref" + command. vdj_chain_type: type: @@ -160,10 +181,24 @@ inputs: - "VDJ-T-GD" default: "VDJ" doc: | - V(D)J chain type. Setting to VDJ will auto-detect the chain type. - Auto-detection does not work for TRG/D (gamma-delta) chains. - Note that gamma-delta analysis is enabled but the algorithm has - not been tested extensively. + V(D)J chain type. Setting to VDJ will + auto-detect the chain type. Auto-detection + does not work for TRG/D (gamma-delta) + chains. Note that gamma-delta analysis is + enabled but the algorithm has not been + tested extensively. + + no_bam: + type: boolean? + default: false + doc: | + Enable or disable BAM file generation. Setting + create-bam to false reduces the total computation + time and the size of the output directory (BAM + file not generated). We recommend setting + create-bam to true if unsure. See + https://10xgen.com/create-bam for additional + guidance [possible values: true, false] threads: type: int? @@ -171,8 +206,8 @@ inputs: position: 10 prefix: "--localcores" doc: | - Set max cores the pipeline may request at one time. - Default: all available + Set max cores the pipeline may request + at one time. Default: all available. memory_limit: type: int? @@ -180,8 +215,8 @@ inputs: position: 11 prefix: "--localmem" doc: | - Set max GB the pipeline may request at one time - Default: all available + Set max GB the pipeline may request + at one time. 
Default: all available. virt_memory_limit: type: int? @@ -189,8 +224,8 @@ inputs: position: 12 prefix: "--localvmem" doc: | - Set max virtual address space in GB for the pipeline - Default: all available + Set max virtual address space in GB + for the pipeline. Default: all available. outputs: @@ -200,75 +235,91 @@ outputs: outputBinding: glob: "sample/outs/per_sample_outs/sample/web_summary.html" doc: | - Run summary metrics and charts in HTML format + Run summary metrics and charts + in HTML format. metrics_summary_report: type: File outputBinding: glob: "sample/outs/per_sample_outs/sample/metrics_summary.csv" doc: | - Run summary metrics in CSV format + Run summary metrics in CSV format. possorted_genome_bam_bai: - type: File + type: File? outputBinding: glob: "sample/outs/per_sample_outs/sample/count/sample_alignments.bam" secondaryFiles: - .bai doc: | - Indexed GEX BAM file containing position-sorted reads aligned to the genome - and transcriptome, as well as unaligned reads. + Indexed RNA BAM file containing + position-sorted reads aligned to + the genome and transcriptome, as + well as unaligned reads. filtered_feature_bc_matrix_folder: type: Directory outputBinding: glob: "sample/outs/per_sample_outs/sample/count/sample_filtered_feature_bc_matrix" doc: | - Folder with filtered feature-barcode matrices containing only cellular - barcodes in MEX format. Each element of the matrix is the number of UMIs - associated with a feature (row) and a barcode (column). + Folder with filtered feature-barcode + matrices containing only cellular + barcodes in MEX format. Each element + of the matrix is the number of UMIs + associated with a feature (row) and + a barcode (column). filtered_feature_bc_matrix_h5: type: File outputBinding: glob: "sample/outs/per_sample_outs/sample/count/sample_filtered_feature_bc_matrix.h5" doc: | - Filtered feature-barcode matrices containing only cellular - barcodes in HDF5 format. 
Each element of the matrix is the number of UMIs - associated with a feature (row) and a barcode (column). + Filtered feature-barcode matrices + containing only cellular barcodes + in HDF5 format. Each element of the + matrix is the number of UMIs + associated with a feature (row) and a + barcode (column). raw_feature_bc_matrices_folder: type: Directory outputBinding: glob: "sample/outs/multi/count/raw_feature_bc_matrix" doc: | - Folder with unfiltered feature-barcode matrices containing all barcodes - in MEX format. Each element of the matrix is the number of UMIs associated - with a feature (row) and a barcode (column). + Folder with unfiltered feature-barcode + matrices containing all barcodes in MEX + format. Each element of the matrix is + the number of UMIs associated with a + feature (row) and a barcode (column). raw_feature_bc_matrices_h5: type: File outputBinding: glob: "sample/outs/multi/count/raw_feature_bc_matrix.h5" doc: | - Unfiltered feature-barcode matrices containing all barcodes in HDF5 format. - Each element of the matrix is the number of UMIs associated with a feature - (row) and a barcode (column). + Unfiltered feature-barcode matrices + containing all barcodes in HDF5 format. + Each element of the matrix is the number + of UMIs associated with a feature (row) + and a barcode (column). secondary_analysis_report_folder: type: Directory outputBinding: glob: "sample/outs/per_sample_outs/sample/count/analysis" doc: | - Folder with secondary analysis of GEX data including dimensionality reduction, - cell clustering, and differential expression + Folder with secondary analysis of + RNA data including dimensionality + reduction, cell clustering, and + differential expression. loupe_browser_track: type: File outputBinding: glob: "sample/outs/per_sample_outs/sample/count/sample_cloupe.cloupe" doc: | - Loupe Browser visualization and analysis file + Loupe Browser visualization + and analysis file. 
all_contig_reads_bam_bai: type: File @@ -277,14 +328,21 @@ outputs: secondaryFiles: - .bai doc: | - Indexed V(D)J BAM file with reads aligned to ALL assembled contigs, per cell barcode. - This file demonstrates how the reads and UMIs support the assembled contigs within - a cell barcode. Reads are not aligned across cell barcode boundaries. Please note - that this BAM excludes reads whose barcodes don't match the whitelist, so it is not - suitable as an archive of every single input read. - This file includes reads from all cells barcodes identified by V(D)J algorithm including - those ones that will be later discarded as non-viable cells by V(D)J algorithm and those - barcodes that will be later removed after overlapping with cells called by GEX algorithm. + Indexed V(D)J BAM file with reads aligned + to ALL assembled contigs, per cell barcode. + This file demonstrates how the reads and UMIs + support the assembled contigs within a cell + barcode. Reads are not aligned across cell + barcode boundaries. Please note that this BAM + excludes reads whose barcodes don't match the + whitelist, so it is not suitable as an archive + of every single input read. This file includes + reads from all cells barcodes identified by + V(D)J algorithm including those ones that will + be later discarded as non-viable cells by V(D)J + algorithm and those barcodes that will be later + removed after overlapping with cells called by + RNA algorithm. all_contig_sequences_fasta: type: File @@ -293,57 +351,73 @@ outputs: secondaryFiles: - .fai doc: | - FASTA format sequence for ALL assembled contigs in the V(D)J library. - This file includes both productive and non-productive contigs with high and low confidence - assembled for all identified cells barcodes including those ones that will be later discarded - as non-viable cells by V(D)J algorithm or after overlapping with cells called by GEX algorithm. + FASTA format sequence for ALL assembled contigs + in the V(D)J library. 
This file includes both + productive and non-productive contigs with high + and low confidence assembled for all identified + cells barcodes including those ones that will be + later discarded as non-viable cells by V(D)J + algorithm or after overlapping with cells called + by RNA algorithm. all_contig_annotations_bed: type: File outputBinding: glob: "sample/outs/multi/vdj_*/all_contig_annotations.bed" doc: | - BED file with high-level and detailed annotations of ALL assembled contigs (from cell and - background barcodes). Used for further investigation into why some contigs were filtered - out. This file includes both productive and non-productive contigs with high and low - confidence assembled for all identified cells barcodes including those ones that will be - later discarded as non-viable cells by V(D)J algorithm or after overlapping with cells - called by GEX algorithm. + BED file with high-level and detailed annotations + of ALL assembled contigs (from cell and background + barcodes). Used for further investigation into why + some contigs were filtered out. This file includes + both productive and non-productive contigs with high + and low confidence assembled for all identified cells + barcodes including those ones that will be later + discarded as non-viable cells by V(D)J algorithm or + after overlapping with cells called by RNA algorithm. all_contig_annotations_csv: type: File outputBinding: glob: "sample/outs/multi/vdj_*/all_contig_annotations.csv" doc: | - CSV file with high-level and detailed annotations of ALL assembled contigs (from cell and - background barcodes). Used for further investigation into why some contigs were filtered - out. This file includes both productive and non-productive contigs with high and low - confidence assembled for all identified cells barcodes including those ones that will be - later discarded as non-viable cells by V(D)J algorithm or after overlapping with cells - called by GEX algorithm. 
+ CSV file with high-level and detailed annotations + of ALL assembled contigs (from cell and background + barcodes). Used for further investigation into why + some contigs were filtered out. This file includes + both productive and non-productive contigs with high + and low confidence assembled for all identified cells + barcodes including those ones that will be later + discarded as non-viable cells by V(D)J algorithm or + after overlapping with cells called by RNA algorithm. airr_rearrangement_tsv: type: File outputBinding: glob: "sample/outs/per_sample_outs/sample/vdj_*/airr_rearrangement.tsv" doc: | - Annotated contigs and consensus sequences of V(D)J rearrangements - in the AIRR format. It includes only viable cells identified by - both V(D)J and GEX algorithms. + Annotated contigs and consensus sequences of V(D)J + rearrangements in the AIRR format. It includes only + viable cells identified by both V(D)J and RNA algorithms. clonotypes_csv: type: File outputBinding: glob: "sample/outs/per_sample_outs/sample/vdj_*/clonotypes.csv" doc: | - CSV file with high-level descriptions of each clonotype. During the clonotype - grouping stage, cell barcodes are placed in groups called clonotypes. Only viable - cells identified by both V(D)J and GEX algorithms are used. Each clonotype consists - of all descendants of a single, fully rearranged common ancestor, as approximated - computationally. During this process, some cell barcodes are flagged as likely - artifacts and filtered out, meaning that they are no longer called as cells. - However, as clonotype grouping stage is hapenning before forming the final version - of files in the per_sample_outs folder, the reported cells number won't be affected. + CSV file with high-level descriptions of each + clonotype. During the clonotype grouping stage, + cell barcodes are placed in groups called + clonotypes. Only viable cells identified by both + V(D)J and RNA algorithms are used. 
Each clonotype + consists of all descendants of a single, fully + rearranged common ancestor, as approximated + computationally. During this process, some cell + barcodes are flagged as likely artifacts and + filtered out, meaning that they are no longer + called as cells. However, as clonotype grouping + stage is happening before forming the final version + of files in the per_sample_outs folder, the reported + cells number won't be affected. germline_contigs_bam_bai: type: File @@ -352,13 +426,15 @@ outputs: secondaryFiles: - .bai doc: | - Indexed V(D)J BAM file with contigs aligned to concatenated germline - segments. For each clonotype consensus, the reference sequence is the - annotated germline segments concatenated together. This file shows how - both the per-cell contigs and the clonotype consensus contig relate to - the germline reference. Useful for revealing polymorphisms, somatic - mutations, and recombination-induced differences such as non-templated - nucleotide additions. + Indexed V(D)J BAM file with contigs aligned to + concatenated germline segments. For each clonotype + consensus, the reference sequence is the annotated + germline segments concatenated together. This file + shows how both the per-cell contigs and the clonotype + consensus contig relate to the germline reference. + Useful for revealing polymorphisms, somatic mutations, + and recombination-induced differences such as + non-templated nucleotide additions. germline_sequences_fasta: type: File @@ -367,9 +443,10 @@ outputs: secondaryFiles: - .fai doc: | - Concatenated V(D)J reference segments for the segments detected on each - consensus sequence. These serve as an approximate reference for each - consensus sequence. + Concatenated V(D)J reference segments for the + segments detected on each consensus sequence. + These serve as an approximate reference for + each consensus sequence.
consensus_contigs_bam_bai: type: File @@ -378,10 +455,12 @@ outputs: secondaryFiles: - .bai doc: | - Indexed V(D)J BAM file with contigs aligned to clonotype consensus. - Each "reference" sequence is a clonotype consensus sequence, and each - record is an alignment of a single cell's contig against this consensus. - This file shows, for a clonotype consensus sequences, how the constituent + Indexed V(D)J BAM file with contigs aligned to + clonotype consensus. Each "reference" sequence + is a clonotype consensus sequence, and each + record is an alignment of a single cell's contig + against this consensus. This file shows, for a + clonotype consensus sequences, how the constituent per-cell assemblies support the consensus. consensus_sequences_fasta: @@ -398,38 +477,43 @@ outputs: outputBinding: glob: "sample/outs/per_sample_outs/sample/vdj_*/consensus_annotations.csv" doc: | - CSV file with high-level and detailed annotations of each clonotype - consensus sequence. + CSV file with high-level and detailed annotations + of each clonotype consensus sequence. filtered_contig_annotations_csv: type: File outputBinding: glob: "sample/outs/per_sample_outs/sample/vdj_*/filtered_contig_annotations.csv" doc: | - CSV file with high-level annotations of each high-confidence contig from - cell-associated barcodes. This is a subset of all_contig_annotations.csv. + CSV file with high-level annotations of each + high-confidence contig from cell-associated + barcodes. This is a subset of + all_contig_annotations.csv. filtered_contig_sequences_fasta: type: File outputBinding: glob: "sample/outs/per_sample_outs/sample/vdj_*/filtered_contig.fasta" doc: | - FASTA format sequence for only high-confidence contigs in cell barcodes. + FASTA format sequence for only high-confidence + contigs in cell barcodes. 
loupe_vdj_browser_track: type: File outputBinding: glob: "sample/outs/per_sample_outs/sample/vdj_*/vloupe.vloupe" doc: | - Loupe V(D)J Browser visualization and analysis file + Loupe V(D)J Browser visualization + and analysis file filtered_data_folder: type: Directory outputBinding: glob: "./sample/outs/per_sample_outs/sample" doc: | - Folder containing filtered data, i.e., only cell-associated barcodes. - Used by cellranger aggr to aggregate samples for joint analysis. + Folder containing filtered data, i.e., only + cell-associated barcodes. Used by cellranger + aggr to aggregate samples for joint analysis. stdout_log: type: stdout @@ -451,9 +535,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cell Ranger Multi Gene Expression and V(D)J Repertoire Profiling" -s:name: "Cell Ranger Multi Gene Expression and V(D)J Repertoire Profiling" -s:alternateName: "Quantifies gene expression and performs profiling of V(D)J repertoire from a single GEM well" +label: "Cell Ranger Count (RNA+VDJ)" +s:name: "Cell Ranger Count (RNA+VDJ)" +s:alternateName: "Quantifies single-cell gene expression, performs V(D)J contigs assembly and clonotype calling of the sequencing data from a single 10x Genomics library in a combined manner" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-multi.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -491,11 +575,11 @@ s:creator: doc: | - Cell Ranger Multi Gene Expression and V(D)J Repertoire Profiling - ================================================================ + Cell Ranger Count (RNA+VDJ) - Quantifies gene expression and performs profiling of V(D)J repertoire - from a single GEM well. + Quantifies single-cell gene expression, performs V(D)J contigs + assembly and clonotype calling of the sequencing data from a + single 10x Genomics library in a combined manner. 
Parameters set by default: --disable-ui - no need in any UI when running in Docker container @@ -511,15 +595,26 @@ doc: | --maxjobs --jobinterval --overrides + --output-dir --uiport --noexit --nopreflight + Not implemented parameters in the [gene-expression] section: + - r1-length - never used + - r2-length - never used + - chemistry - should be auto-estimated + - expect-cells - should be auto-estimated + - force-cells - not needed now + - include-introns - by default is true, which is good + - no-secondary - no reason to disable it + - check-library-compatibility - no reason to disable it + As for running cellranger aggr with cellranger multi outputs we need only per_sample_outs/sample folder that already includes all necessary files, there is no need to return the following files as separate outputs: - - sample_molecule_info.h5 - used for GEX aggregation + - sample_molecule_info.h5 - used for RNA aggregation - vdj_contig_info.pb - used for V(D)J aggregation Why do we need to rename input files? @@ -538,27 +633,29 @@ doc: | s:about: | Analyze multiplexed data or combined gene expression/immune profiling/feature barcode data - USAGE: - cellranger multi [OPTIONS] --id --csv - - OPTIONS: - --id A unique run id and output folder name [a-zA-Z0-9_-]+ - --description Sample description to embed in output files [default: ] - --csv Path of CSV file enumerating input libraries and analysis parameters - --dry Do not execute the pipeline. Generate a pipeline invocation (.mro) file and stop - --jobmode Job manager to use. Valid options: local (default), sge, lsf, slurm or path to a .template file. Search for help on "Cluster Mode" at - support.10xgenomics.com for more details on configuring the pipeline to use a compute cluster [default: local] - --localcores Set max cores the pipeline may request at one time. Only applies to local jobs - --localmem Set max GB the pipeline may request at one time.
Only applies to local jobs - --localvmem Set max virtual address space in GB for the pipeline. Only applies to local jobs - --mempercore Reserve enough threads for each job to ensure enough memory will be available, assuming each core on your cluster has at least this much memory - available. Only applies to cluster jobmodes - --maxjobs Set max jobs submitted to cluster at one time. Only applies to cluster jobmodes - --jobinterval Set delay between submitting jobs to cluster, in ms. Only applies to cluster jobmodes - --overrides The path to a JSON file that specifies stage-level overrides for cores and memory. Finer-grained than --localcores, --mempercore and --localmem. - Consult https://support.10xgenomics.com/ for an example override file - --uiport Serve web UI at http://localhost:PORT - --disable-ui Do not serve the web UI - --noexit Keep web UI running after pipestance completes or fails - --nopreflight Skip preflight checks - -h, --help Print help information \ No newline at end of file + Usage: cellranger multi [OPTIONS] --id --csv + + Options: + --id A unique run id and output folder name [a-zA-Z0-9_-]+ + --description Sample description to embed in output files [default: ] + --csv Path of CSV file enumerating input libraries and analysis parameters + --dry Do not execute the pipeline. Generate a pipeline invocation (.mro) file and stop + --jobmode Job manager to use. Valid options: local (default), sge, lsf, slurm or path to a .template file. Search for + help on "Cluster Mode" at support.10xgenomics.com for more details on configuring the pipeline to use a + compute cluster + --localcores Set max cores the pipeline may request at one time. Only applies to local jobs + --localmem Set max GB the pipeline may request at one time. Only applies to local jobs + --localvmem Set max virtual address space in GB for the pipeline. 
Only applies to local jobs + --mempercore Reserve enough threads for each job to ensure enough memory will be available, assuming each core on your + cluster has at least this much memory available. Only applies to cluster jobmodes + --maxjobs Set max jobs submitted to cluster at one time. Only applies to cluster jobmodes + --jobinterval Set delay between submitting jobs to cluster, in ms. Only applies to cluster jobmodes + --overrides The path to a JSON file that specifies stage-level overrides for cores and memory. Finer-grained than + --localcores, --mempercore and --localmem. Consult https://support.10xgenomics.com/ for an example override + file + --output-dir Output the results to this directory + --uiport Serve web UI at http://localhost:PORT + --disable-ui Do not serve the web UI + --noexit Keep web UI running after pipestance completes or fails + --nopreflight Skip preflight checks + -h, --help Print help \ No newline at end of file diff --git a/tools/cellranger-reanalyze.cwl b/tools/cellranger-reanalyze.cwl index a872c23b..aefd19ca 100644 --- a/tools/cellranger-reanalyze.cwl +++ b/tools/cellranger-reanalyze.cwl @@ -5,7 +5,7 @@ class: CommandLineTool requirements: - class: InlineJavascriptRequirement - class: DockerRequirement - dockerPull: cumulusprod/cellranger:7.0.0 + dockerPull: cumulusprod/cellranger:8.0.1 hints: - class: InitialWorkDirRequirement @@ -43,8 +43,10 @@ inputs: position: 5 prefix: "--matrix" doc: | - A feature-barcode matrix containing data for one genome. - Should be the filtered version, unless using --force-cells + A feature-barcode matrix containing + data for one genome. Should be the + filtered version, unless using + --force-cells selected_barcodes: type: File? @@ -52,9 +54,11 @@ inputs: position: 6 prefix: "--barcodes" doc: | - A CSV file containing a list of cell barcodes to use for reanalysis, - e.g. barcodes exported from Loupe Browser. All barcodes must be present - in the matrix. 
+ A CSV file containing a list of cell + barcodes to use for reanalysis, e.g. + barcodes exported from Loupe Browser. + All barcodes must be present in the + matrix. selected_genes: type: File? @@ -62,9 +66,11 @@ inputs: position: 7 prefix: "--genes" doc: | - A CSV file containing a list of gene IDs to use for reanalysis (corresponding - to the gene_id field of the reference GTF). All gene IDs must be present in - the matrix. Note that only gene features are used in secondary analysis. + A CSV file containing a list of gene + IDs to use for reanalysis (corresponding + to the gene_id field of the reference + GTF). All gene IDs must be present in + the matrix. excluded_genes: type: File? @@ -72,10 +78,12 @@ inputs: position: 8 prefix: "--exclude-genes" doc: | - A CSV file containing a list of gene IDs to exclude for reanalysis (corresponding - to the gene_id field of the reference GTF). All gene IDs must be present in - the matrix. The exclusion is applied after setting the gene list with --genes. - Note that only gene features are used in secondary analysis. + A CSV file containing a list of gene IDs + to exclude for reanalysis (corresponding + to the gene_id field of the reference GTF). + All gene IDs must be present in the matrix. + The exclusion is applied after setting the + gene list with --genes. force_cells: type: int? @@ -83,9 +91,12 @@ inputs: position: 9 prefix: "--force-cells" doc: | - Force pipeline to use this number of cells, bypassing the cell detection algorithm. - Use this if the number of cells estimated by Cell Ranger is not consistent with the - barcode rank plot. If specifying a value that exceeds the original cell count, you + Force pipeline to use this number of cells, + bypassing the cell detection algorithm. + Use this if the number of cells estimated + by Cell Ranger is not consistent with the + barcode rank plot. 
If specifying a value + that exceeds the original cell count, you must use the raw_gene_bc_matrices_h5.h5 threads: @@ -94,7 +105,8 @@ inputs: position: 10 prefix: "--localcores" doc: | - Set max cores the pipeline may request at one time. + Set max cores the pipeline may request + at one time. Default: all available memory_limit: @@ -103,7 +115,8 @@ inputs: position: 11 prefix: "--localmem" doc: | - Set max GB the pipeline may request at one time + Set max GB the pipeline may request + at one time Default: all available virt_memory_limit: @@ -112,189 +125,233 @@ inputs: position: 12 prefix: "--localvmem" doc: | - Set max virtual address space in GB for the pipeline + Set max virtual address space in + GB for the pipeline Default: all available num_analysis_bcs: type: int? doc: | - Randomly subset data to N barcodes for all analysis. Reduce this parameter if you - want to improve performance or simulate results from lower cell counts. Cannot be - set higher than the available number of cells. + Randomly subset data to N barcodes for all analysis. + Reduce this parameter if you want to improve + performance or simulate results from lower cell counts. + Cannot be set higher than the available number of cells. Default: null num_pca_bcs: type: int? doc: | - Randomly subset data to N barcodes when computing PCA projection (the most memory-intensive - step). The PCA projection will still be applied to the full dataset, i.e. your final results - will still reflect all the data. Try reducing this parameter if your analysis is running out - of memory. Cannot be set higher than the available number of cells. + Randomly subset data to N barcodes when computing + PCA projection (the most memory-intensive step). + The PCA projection will still be applied to the full + dataset, i.e. your final results will still reflect all + the data. Try reducing this parameter if your + analysis is running out of memory. Cannot be set + higher than the available number of cells. 
Default: null num_pca_genes: type: int? doc: | - Subset data to the top N genes (ranked by normalized dispersion) when computing PCA. - Differential expression will still reflect all genes. Try reducing this parameter if - your analysis is running out of memory. Cannot be set higher than the number of genes - in the reference transcriptome. + Subset data to the top N genes (ranked by normalized + dispersion) when computing PCA. Differential + expression will still reflect all genes. Try reducing + this parameter if your analysis is running out of memory. + Cannot be set higher than the number of genes in the + reference transcriptome. Default: null num_principal_comps: type: int? doc: | - Compute N principal components for PCA. Setting this too high may cause spurious clusters - to be called. The default value is 100 when the chemistry batch correction is enabled. - Set from 10 to 100, depending on the number of cell populations/clusters you expect to see. + Compute N principal components for PCA. Setting this + too high may cause spurious clusters to be called. The + default value is 100 when the chemistry batch correction + is enabled. Set from 10 to 100, depending on the number + of cell populations/clusters you expect to see. Default: 10 cbc_knn: type: int? doc: | - Specify the number of nearest neighbors used to identify mutual nearest neighbors. - Setting this too high will increase runtime and may cause out of memory error. - See Chemistry Batch Correction page for more details. Ranges from 5 to 20. + Specify the number of nearest neighbors used to identify + mutual nearest neighbors. Setting this too high will + increase runtime and may cause out of memory error. See + Chemistry Batch Correction page for more details. Ranges + from 5 to 20. Default: 10 cbc_alpha: type: float? doc: | - Specify the threshold of the percentage of matched cells between two batches, - which is used to determine if the batch pair will be merged. 
See Chemistry - Batch Correction page for more details. Ranges from 0.05 to 0.5. + Specify the threshold of the percentage of matched cells + between two batches, which is used to determine if the + batch pair will be merged. See Chemistry Batch Correction + page for more details. Ranges from 0.05 to 0.5. Default: 0.1 cbc_sigma: type: float? doc: | - Specify the bandwidth of the Gaussian smoothing kernel used to compute the correction - vector for each cell. See Chemistry Batch Correction page for more details. Ranges + Specify the bandwidth of the Gaussian smoothing kernel + used to compute the correction vector for each cell. See + Chemistry Batch Correction page for more details. Ranges from 10 to 500. Default: 150 cbc_realign_panorama: type: boolean? doc: | - Specify if two batches will be merged if they are already in the same panorama. Setting - this to True will usually improve the performance, but will also increase runtime and - memory usage. See Chemistry Batch Correction page for more details. One of true or false. + Specify if two batches will be merged if they are already + in the same panorama. Setting this to True will usually + improve performance, but will also increase runtime and + memory usage. See Chemistry Batch Correction page for + more details. One of true or false. Default: false graphclust_neighbors: type: int? doc: | - Number of nearest-neighbors to use in the graph-based clustering. Lower values result in - higher-granularity clustering. The actual number of neighbors used is the maximum of this - value and that determined by neighbor_a and neighbor_b. Set this value to zero to use those - values instead. Ranged from 10 to 500, depending on desired granularity. + Number of nearest-neighbors to use in the graph-based + clustering. Lower values result in higher-granularity + clustering. The actual number of neighbors used is the + maximum of this value and that determined by neighbor_a + and neighbor_b. 
Set this value to zero to use those + values instead. Ranges from 10 to 500, depending on + desired granularity. Default: 0 neighbor_a: type: float? doc: | - The number of nearest neighbors, k, used in the graph-based clustering is computed as follows: - k = neighbor_a + neighbor_b * log10(n_cells). The actual number of neighbors used is the maximum - of this value and graphclust_neighbors. Determines how clustering granularity scales with cell count. + The number of nearest neighbors, k, used in the graph-based + clustering is computed as follows: k = neighbor_a + neighbor_b * + log10(n_cells). The actual number of neighbors used is the maximum + of this value and graphclust_neighbors. Determines how clustering + granularity scales with cell count. Default: -230.0 neighbor_b: type: float? doc: | - The number of nearest neighbors, k, used in the graph-based clustering is computed as follows: - k = neighbor_a + neighbor_b * log10(n_cells). The actual number of neighbors used is the maximum of - this value and graphclust_neighbors. Determines how clustering granularity scales with cell count. + The number of nearest neighbors, k, used in the graph-based + clustering is computed as follows: k = neighbor_a + neighbor_b * + log10(n_cells). The actual number of neighbors used is the maximum + of this value and graphclust_neighbors. Determines how clustering + granularity scales with cell count. Default: 120.0 max_clusters: type: int? doc: | - Compute K-means clustering using K values of 2 to N. Setting this too high may cause spurious clusters - to be called. Ranges from 10 to 50, depending on the number of cell populations / clusters you expect to see. + Compute K-means clustering using K values of 2 to N. + Setting this too high may cause spurious clusters to be + called. Ranges from 10 to 50, depending on the number + of cell populations/clusters you expect to see. Default: 10 tsne_input_pcs: type: int? doc: | - Subset to top N principal components for TSNE.
Change this parameter if you want to see how the TSNE plot - changes when using fewer PCs, independent of the clustering / differential expression. You may find that TSNE - is faster and/or the output looks better when using fewer PCs. Cannot be set higher than - the num_principal_comps parameter. + Subset to top N principal components for TSNE. Change + this parameter if you want to see how the TSNE plot + changes when using fewer PCs, independent of the + clustering/differential expression. You may find that + TSNE is faster and/or the output looks better when using + fewer PCs. Cannot be set higher than the num_principal_comps + parameter. Default: null tsne_perplexity: type: int? doc: | - TSNE perplexity parameter (see the TSNE FAQ for more details). When analyzing 100k+ cells, increasing this - parameter may improve TSNE results, but the algorithm will be slower. Ranges from 30 to 50. + TSNE perplexity parameter (see the TSNE FAQ for more details). + When analyzing 100k+ cells, increasing this parameter may + improve TSNE results, but the algorithm will be slower. + Ranges from 30 to 50. Default: 30 tsne_theta: type: float? doc: | - TSNE theta parameter (see the TSNE FAQ for more details). Higher values yield faster, more approximate results - (and vice versa). The runtime and memory performance of TSNE will increase dramatically if you set this below 0.25. + TSNE theta parameter (see the TSNE FAQ for more details). + Higher values yield faster, more approximate results (and + vice versa). The runtime and memory performance of TSNE + will increase dramatically if you set this below 0.25. Ranges from 0 to 1. Default: 0.5 tsne_max_dims: type: int? doc: | - Maximum number of TSNE output dimensions. Set this to 3 to produce both 2D and 3D TSNE projections - (note: runtime will increase significantly). Ranges from 2 to 3. + Maximum number of TSNE output dimensions. 
Set this to 3 to + produce both 2D and 3D TSNE projections (note: runtime will + increase significantly). Ranges from 2 to 3. Default: 2 tsne_max_iter: type: int? doc: | - Number of total TSNE iterations. Try increasing this if TSNE results do not look good on larger numbers - of cells. Runtime increases linearly with number of iterations. Ranges from 1000 to 10000. + Number of total TSNE iterations. Try increasing this if + TSNE results do not look good on larger numbers of cells. + Runtime increases linearly with the number of iterations. + Ranges from 1000 to 10000. Default: 1000 tsne_stop_lying_iter: type: int? doc: | - Iteration at which TSNE learning rate is reduced. Try increasing this if TSNE results do not look good - on larger numbers of cells. Cannot be set higher than tsne_max_iter. + Iteration at which TSNE learning rate is reduced. Try + increasing this if TSNE results do not look good on larger + numbers of cells. Cannot be set higher than tsne_max_iter. Default: 250 tsne_mom_switch_iter: type: int? doc: | - Iteration at which TSNE momentum is reduced. Try increasing this if TSNE results do not look good on - larger numbers of cells. Cannot be set higher than tsne_max_iter. Cannot be set higher than tsne_max_iter. + Iteration at which TSNE momentum is reduced. Try + increasing this if TSNE results do not look good on + larger numbers of cells. Cannot be set higher than + tsne_max_iter. Default: 250 umap_input_pcs: type: int? doc: | - Subset to top N principal components for UMAP. Change this parameter if you want to see how the UMAP plot - changes when using fewer PCs, independent of the clustering / differential expression. You may find that - UMAP is faster and/or the output looks better when using fewer PCs. Cannot be set higher than the + Subset to top N principal components for UMAP. Change + this parameter if you want to see how the UMAP plot + changes when using fewer PCs, independent of the + clustering/differential expression. 
You may find that + UMAP is faster and/or the output looks better when + using fewer PCs. Cannot be set higher than the num_principal_comps parameter. Default: null umap_n_neighbors: type: int? doc: | - Determines the number of neighboring points used in local approximations of manifold structure. - Larger values will usually result in more global structure at the loss of detailed local structure. - Ranges from 5 to 50. + Determines the number of neighboring points used in + local approximations of manifold structure. Larger values + will usually result in more global structure at the loss + of detailed local structure. Ranges from 5 to 50. Default: 30 umap_max_dims: type: int? doc: | - Maximum number of UMAP output dimensions. Set this to 3 to produce both 2D and 3D UMAP projections. - Ranges from 2 to 3. + Maximum number of UMAP output dimensions. Set this to 3 + to produce both 2D and 3D UMAP projections. Ranges from 2 + to 3. Default: 2 umap_min_dist: type: float? doc: | - Controls how tightly the embedding is allowed to pack points together. Larger values make embedded - points are more evenly distributed, while smaller values make the embedding more accurately with - regard to the local structure. Ranges from 0.001 to 0.5. + Controls how tightly the embedding is allowed to pack points + together. Larger values make embedded points more evenly + distributed, while smaller values make the embedding more + accurate with regard to the local structure. Ranges from + 0.001 to 0.5. Default: 0.3 umap_metric: @@ -330,9 +387,11 @@ inputs: random_seed: type: int? doc: | - Random seed. Due to the randomized nature of the algorithms, changing this will produce slightly - different results. If the TSNE or UMAP results don't look good, try running multiple times with - different seeds and pick the TSNE or UMAP that looks best. + Random seed. Due to the randomized nature of the algorithms, + changing this will produce slightly different results. 
If + the TSNE or UMAP results don't look good, try running + multiple times with different seeds and pick the TSNE or + UMAP that looks best. Default: 0 @@ -343,37 +402,40 @@ outputs: outputBinding: glob: "reanalyzed/outs/analysis" doc: | - Folder with secondary analysis results including dimensionality reduction, - cell clustering, and differential expression for reanalyzed results + Folder with secondary analysis results including + dimensionality reduction, cell clustering, and + differential expression for reanalyzed results. web_summary_report: type: File outputBinding: glob: "reanalyzed/outs/web_summary.html" doc: | - Reanalyzed run summary metrics and charts in HTML format + Reanalyzed run summary metrics and charts + in HTML format. filtered_feature_bc_matrix_folder: type: Directory outputBinding: glob: "reanalyzed/outs/filtered_feature_bc_matrix" doc: | - Folder with filtered feature-barcode matrices containing only cellular - barcodes in MEX format. + Folder with filtered feature-barcode matrices + containing only cellular barcodes in MEX format. reanalyze_params: type: File outputBinding: glob: "reanalyzed/outs/params.csv" doc: | - Copy of the input params CSV file + Copy of the input params CSV file. loupe_browser_track: type: File outputBinding: glob: "reanalyzed/outs/cloupe.cloupe" doc: | - Loupe Browser visualization and analysis file for reanalyzed results + Loupe Browser visualization and analysis + file for reanalyzed results. 
stdout_log: type: stdout @@ -399,11 +461,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Cell Ranger Reanalyze" -s:name: "Cell Ranger Reanalyze" -s:alternateName: | - Reruns secondary analysis performed on the GEX feature-barcode matrix (dimensionality reduction, - clustering and visualization) using different parameter settings +label: "Cellranger Reanalyze" +s:name: "Cellranger Reanalyze" +s:alternateName: "Reruns secondary analysis for Cell Ranger Count Gene Expression or Cell Ranger Multi experiments" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/cellranger-reanalyze.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -443,15 +503,17 @@ s:creator: doc: | Cell Ranger Reanalyze - Runs cellranger reanalyze command to rerun secondary analysis performed on the - GEX feature-barcode matrix (dimensionality reduction, clustering and visualization) - using different parameter settings. + Reruns secondary analysis for Cell Ranger Count + Gene Expression or Cell Ranger Multi experiments - Rerunning the analysis for aggregated experiments is not currently supported. + Rerunning the analysis for aggregated experiments + is not currently supported. 
Parameters set by default: - --disable-ui - no need in any UI when running in Docker container - --id - hardcoded to `reanalyzed` as we want to return the content of the + --disable-ui - no need in any UI when running in + Docker container + --id - hardcoded to `reanalyzed` as we want + to return the content of the output folder as separate outputs Skipped outputs as they are identical to inputs: @@ -459,8 +521,11 @@ doc: | Not implemented parameters: --description - not needed for now - --agg - we don't support reruning secondary analysis from aggregated samples - --dry - not applicable to our use case + --agg - we don't support rerunning + secondary analysis from + the aggregated samples + --dry - not applicable to our use + case --jobmode - we use default local mode --mempercore - not used for local mode --maxjobs - not used for local mode @@ -468,42 +533,50 @@ doc: | --overrides - not needed for now --uiport - we disabled UI --noexit - we disabled UI - --nopreflight - no reason to skip preflight checks + --output-dir - not needed for now + --nopreflight - no reason to skip preflight + checks s:about: | Re-run secondary analysis (dimensionality reduction, clustering, etc) - USAGE: - cellranger reanalyze [OPTIONS] --id --matrix - - OPTIONS: - --id A unique run id and output folder name [a-zA-Z0-9_-]+ - --description Sample description to embed in output files [default: ] - --matrix A feature-barcode matrix containing data for one genome. Should be the filtered version, unless using --force-cells - --params A CSV file specifying analysis parameters. Optional - --barcodes A CSV file containing a list of cell barcodes to use for reanalysis, e.g. barcodes exported from Loupe Browser. Optional - --genes A CSV file containing a list of feature IDs to use for reanalysis. For gene expression, this should correspond to the gene_id field in the - reference GTF should be \(e.g. ENSG... for ENSEMBL-based references\).
Optional - --exclude-genes A CSV file containing a list of feature IDs to exclude from reanalysis. For gene expression, this should correspond to the gene_id field in - the reference GTF \(e.g., ENSG... for ENSEMBL-based references\). The exclusion is applied after --genes. Optional - --agg If the input matrix was produced by 'aggr', you may pass the same aggregation CSV in order to retain per-library tag information in the - resulting .cloupe file. This argument is required to enable chemistry batch correction. Optional - --force-cells Force pipeline to use this number of cells, bypassing cell calling algorithm. [MINIMUM: 10] - --dry Do not execute the pipeline. Generate a pipeline invocation (.mro) file and stop - --jobmode Job manager to use. Valid options: local (default), sge, lsf, slurm or path to a .template file. Search for help on "Cluster Mode" at - support.10xgenomics.com for more details on configuring the pipeline to use a compute cluster [default: local] - --localcores Set max cores the pipeline may request at one time. Only applies to local jobs - --localmem Set max GB the pipeline may request at one time. Only applies to local jobs - --localvmem Set max virtual address space in GB for the pipeline. Only applies to local jobs - --mempercore Reserve enough threads for each job to ensure enough memory will be available, assuming each core on your cluster has at least this much - memory available. Only applies to cluster jobmodes - --maxjobs Set max jobs submitted to cluster at one time. Only applies to cluster jobmodes - --jobinterval Set delay between submitting jobs to cluster, in ms. Only applies to cluster jobmodes - --overrides The path to a JSON file that specifies stage-level overrides for cores and memory. Finer-grained than --localcores, --mempercore and - --localmem. 
Consult https://support.10xgenomics.com/ for an example override file - --uiport Serve web UI at http://localhost:PORT - --disable-ui Do not serve the web UI - --noexit Keep web UI running after pipestance completes or fails - --nopreflight Skip preflight checks - -h, --help Print help information \ No newline at end of file + Usage: cellranger reanalyze [OPTIONS] --id --matrix + + Options: + --id A unique run id and output folder name [a-zA-Z0-9_-]+ + --description Sample description to embed in output files [default: ] + --matrix A feature-barcode matrix containing data for one genome. Should be the filtered version, unless using + --force-cells + --params A CSV file specifying analysis parameters. Optional + --barcodes A CSV file containing a list of cell barcodes to use for reanalysis, e.g. barcodes exported from Loupe + Browser. Optional + --genes A CSV file containing a list of feature IDs to use for reanalysis. For gene expression, this should correspond + to the gene_id field in the reference GTF should be \(e.g. ENSG... for ENSEMBL-based references\). Optional + --exclude-genes A CSV file containing a list of feature IDs to exclude from reanalysis. For gene expression, this should + correspond to the gene_id field in the reference GTF \(e.g., ENSG... for ENSEMBL-based references\). The + exclusion is applied after --genes. Optional + --agg If the input matrix was produced by 'aggr', you may pass the same aggregation CSV in order to retain + per-library tag information in the resulting .cloupe file. This argument is required to enable chemistry + batch correction. Optional + --force-cells Force pipeline to use this number of cells, bypassing cell calling algorithm. [MINIMUM: 10] + --dry Do not execute the pipeline. Generate a pipeline invocation (.mro) file and stop + --jobmode Job manager to use. Valid options: local (default), sge, lsf, slurm or path to a .template file. 
Search for + help on "Cluster Mode" at support.10xgenomics.com for more details on configuring the pipeline to use a + compute cluster + --localcores Set max cores the pipeline may request at one time. Only applies to local jobs + --localmem Set max GB the pipeline may request at one time. Only applies to local jobs + --localvmem Set max virtual address space in GB for the pipeline. Only applies to local jobs + --mempercore Reserve enough threads for each job to ensure enough memory will be available, assuming each core on your + cluster has at least this much memory available. Only applies to cluster jobmodes + --maxjobs Set max jobs submitted to cluster at one time. Only applies to cluster jobmodes + --jobinterval Set delay between submitting jobs to cluster, in ms. Only applies to cluster jobmodes + --overrides The path to a JSON file that specifies stage-level overrides for cores and memory. Finer-grained than + --localcores, --mempercore and --localmem. Consult https://support.10xgenomics.com/ for an example override + file + --output-dir Output the results to this directory + --uiport Serve web UI at http://localhost:PORT + --disable-ui Do not serve the web UI + --noexit Keep web UI running after pipestance completes or fails + --nopreflight Skip preflight checks + -h, --help Print help \ No newline at end of file diff --git a/tools/fastq-dump.cwl b/tools/fastq-dump.cwl index 049ff015..feaf5100 100644 --- a/tools/fastq-dump.cwl +++ b/tools/fastq-dump.cwl @@ -10,7 +10,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/fastqdwnld:v0.0.3 + dockerPull: biowardrobe2/fastqdwnld:v0.0.4 inputs: @@ -168,10 +168,10 @@ outputs: return (!!splitted_line.length)?splitted_line:null; } - log_stdout: + stdout_log: type: stdout - log_stderr: + stderr_log: type: stderr @@ -187,9 +187,9 @@ $namespaces: $schemas: - https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf -label: "Fastq-Dump on Steroids" -s:name: 
"Fastq-Dump on Steroids" -s:alternateName: "Downloads FASTQ files from the provided SRR identifier" +label: "FASTQ Download" +s:name: "FASTQ Download" +s:alternateName: "Assists in downloading problematic single-cell sequencing data from Sequence Read Archive (SRA)" s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/fastq-dump.cwl s:codeRepository: https://github.com/Barski-lab/workflows @@ -227,9 +227,10 @@ s:creator: doc: | - Fastq-Dump on Steroids + FASTQ Download - Downloads FASTQ files from the provided SRR identifier + Assists in downloading problematic single-cell sequencing + data from Sequence Read Archive (SRA) s:about: | diff --git a/workflows/cellranger-aggr.cwl b/workflows/cellranger-aggr.cwl index cb31e4e7..dda3657d 100644 --- a/workflows/cellranger-aggr.cwl +++ b/workflows/cellranger-aggr.cwl @@ -188,6 +188,15 @@ outputs: label: "Loupe V(D)J Browser visualization and analysis file" doc: "Loupe V(D)J Browser visualization and analysis file" + airr_rearrangement_tsv: + type: File? + outputSource: aggregate_counts/airr_rearrangement_tsv + label: "Annotated contigs and consensus sequences of V(D)J rearrangements in the AIRR format" + doc: | + Annotated contigs and consensus sequences of V(D)J + rearrangements in the AIRR format. It includes only + viable cells identified by both V(D)J and RNA algorithms. 
+ compressed_html_data_folder: type: File outputSource: compress_html_data_folder/compressed_folder @@ -250,6 +259,7 @@ steps: - consensus_annotations_csv - filtered_contig_annotations_csv - loupe_vdj_browser_track + - airr_rearrangement_tsv - stdout_log - stderr_log diff --git a/workflows/cellranger-arc-aggr.cwl b/workflows/cellranger-arc-aggr.cwl index e1195fe4..1eb81df2 100644 --- a/workflows/cellranger-arc-aggr.cwl +++ b/workflows/cellranger-arc-aggr.cwl @@ -24,7 +24,7 @@ inputs: sd:preview: position: 1 - gex_molecule_info_h5: + rna_molecule_info_h5: type: File[] label: "Cell Ranger RNA+ATAC Sample" doc: | @@ -32,7 +32,7 @@ inputs: that produces both gene expression and chromatin accessibility data from a single 10x Genomics library - "sd:upstreamSource": "sc_arc_sample/gex_molecule_info_h5" + "sd:upstreamSource": "sc_arc_sample/rna_molecule_info_h5" "sd:localLabel": true gem_well_labels: @@ -63,6 +63,10 @@ inputs: "sd:upstreamSource": "genome_indices/arc_indices_folder" "sd:localLabel": true + annotation_gtf_file: + type: File + "sd:upstreamSource": "genome_indices/genome_indices/annotation_gtf" + memory_limit: type: int? 
default: 20 @@ -296,7 +300,7 @@ steps: in: atac_fragments_file_from_count: atac_fragments_file_from_count barcode_metrics_report: barcode_metrics_report - gex_molecule_info_h5: gex_molecule_info_h5 + rna_molecule_info_h5: rna_molecule_info_h5 gem_well_labels: gem_well_labels indices_folder: indices_folder normalization_mode: normalization_mode @@ -348,6 +352,7 @@ steps: in: secondary_analysis_report_folder: aggregate_counts/secondary_analysis_report_folder filtered_feature_bc_matrix_folder: aggregate_counts/filtered_feature_bc_matrix_folder + annotation_gtf_file: annotation_gtf_file aggregation_metadata: aggregate_counts/aggregation_metadata out: - html_data diff --git a/workflows/cellranger-arc-count.cwl b/workflows/cellranger-arc-count.cwl index 17c69123..d24d1340 100644 --- a/workflows/cellranger-arc-count.cwl +++ b/workflows/cellranger-arc-count.cwl @@ -36,12 +36,16 @@ inputs: "sd:upstreamSource": "genome_indices/arc_indices_folder" "sd:localLabel": true + annotation_gtf_file: + type: File + "sd:upstreamSource": "genome_indices/genome_indices/annotation_gtf" + memory_limit: type: int? default: 20 "sd:upstreamSource": "genome_indices/memory_limit" - gex_fastq_file_r1: + rna_fastq_file_r1: type: - File - type: array @@ -54,7 +58,7 @@ inputs: If multiple files provided they will be merged. 
- gex_fastq_file_r2: + rna_fastq_file_r2: type: - File - type: array @@ -166,9 +170,9 @@ outputs: tab: "Overview" target: "_blank" - fastqc_report_gex_fastq_r1: + fastqc_report_rna_fastq_r1: type: File - outputSource: run_fastqc_for_gex_fastq_r1/html_file + outputSource: run_fastqc_for_rna_fastq_r1/html_file label: "QC report (RNA FASTQ, Read 1)" doc: | FastqQC report generated for @@ -178,9 +182,9 @@ outputs: tab: "Overview" target: "_blank" - fastqc_report_gex_fastq_r2: + fastqc_report_rna_fastq_r2: type: File - outputSource: run_fastqc_for_gex_fastq_r2/html_file + outputSource: run_fastqc_for_rna_fastq_r2/html_file label: "QC report (RNA FASTQ, Read 2)" doc: | FastqQC report generated for @@ -249,9 +253,9 @@ outputs: identified as a cell-associated partition by the pipeline. - gex_possorted_genome_bam_bai: + rna_possorted_genome_bam_bai: type: File - outputSource: generate_counts_matrix/gex_possorted_genome_bam_bai + outputSource: generate_counts_matrix/rna_possorted_genome_bam_bai label: "RNA reads" doc: | Genome track of RNA reads aligned to @@ -358,9 +362,9 @@ outputs: clustering results above and linkage between ATAC and RNA data. 
- gex_molecule_info_h5: + rna_molecule_info_h5: type: File - outputSource: generate_counts_matrix/gex_molecule_info_h5 + outputSource: generate_counts_matrix/rna_molecule_info_h5 label: "RNA molecule-level data" doc: | Count and barcode information for @@ -480,19 +484,19 @@ outputs: steps: - extract_gex_fastq_r1: + extract_rna_fastq_r1: run: ../tools/extract-fastq.cwl in: - compressed_file: gex_fastq_file_r1 + compressed_file: rna_fastq_file_r1 output_prefix: default: "rna_read_1" out: - fastq_file - extract_gex_fastq_r2: + extract_rna_fastq_r2: run: ../tools/extract-fastq.cwl in: - compressed_file: gex_fastq_file_r2 + compressed_file: rna_fastq_file_r2 output_prefix: default: "rna_read_2" out: @@ -525,20 +529,20 @@ steps: out: - fastq_file - run_fastqc_for_gex_fastq_r1: + run_fastqc_for_rna_fastq_r1: run: ../tools/fastqc.cwl in: - reads_file: extract_gex_fastq_r1/fastq_file + reads_file: extract_rna_fastq_r1/fastq_file threads: source: threads valueFrom: $(parseInt(self)) out: - html_file - run_fastqc_for_gex_fastq_r2: + run_fastqc_for_rna_fastq_r2: run: ../tools/fastqc.cwl in: - reads_file: extract_gex_fastq_r2/fastq_file + reads_file: extract_rna_fastq_r2/fastq_file threads: source: threads valueFrom: $(parseInt(self)) @@ -578,8 +582,8 @@ steps: generate_counts_matrix: run: ../tools/cellranger-arc-count.cwl in: - gex_fastq_file_r1: extract_gex_fastq_r1/fastq_file - gex_fastq_file_r2: extract_gex_fastq_r2/fastq_file + rna_fastq_file_r1: extract_rna_fastq_r1/fastq_file + rna_fastq_file_r2: extract_rna_fastq_r2/fastq_file atac_fastq_file_r1: extract_atac_fastq_r1/fastq_file atac_fastq_file_r2: extract_atac_fastq_r2/fastq_file atac_fastq_file_r3: extract_atac_fastq_r3/fastq_file @@ -594,14 +598,14 @@ steps: - web_summary_report - metrics_summary_report - barcode_metrics_report - - gex_possorted_genome_bam_bai + - rna_possorted_genome_bam_bai - atac_possorted_genome_bam_bai - filtered_feature_bc_matrix_folder - filtered_feature_bc_matrix_h5 - 
raw_feature_bc_matrices_folder - raw_feature_bc_matrices_h5 - secondary_analysis_report_folder - - gex_molecule_info_h5 + - rna_molecule_info_h5 - loupe_browser_track - atac_fragments_file - atac_peaks_bed_file @@ -645,6 +649,7 @@ steps: in: secondary_analysis_report_folder: generate_counts_matrix/secondary_analysis_report_folder filtered_feature_bc_matrix_folder: generate_counts_matrix/filtered_feature_bc_matrix_folder + annotation_gtf_file: annotation_gtf_file out: - html_data - index_html_file diff --git a/workflows/cellranger-atac-aggr.cwl b/workflows/cellranger-atac-aggr.cwl index 1553a264..94b88ee9 100644 --- a/workflows/cellranger-atac-aggr.cwl +++ b/workflows/cellranger-atac-aggr.cwl @@ -67,6 +67,10 @@ inputs: "sd:upstreamSource": "genome_indices/arc_indices_folder" "sd:localLabel": true + annotation_gtf_file: + type: File + "sd:upstreamSource": "genome_indices/genome_indices/annotation_gtf" + memory_limit: type: int? default: 20 @@ -309,6 +313,7 @@ steps: secondary_analysis_report_folder: aggregate_counts/secondary_analysis_report_folder filtered_feature_bc_matrix_folder: aggregate_counts/filtered_feature_bc_matrix_folder aggregation_metadata: aggregate_counts/aggregation_metadata + annotation_gtf_file: annotation_gtf_file out: - html_data - index_html_file diff --git a/workflows/cellranger-atac-count.cwl b/workflows/cellranger-atac-count.cwl index 72705def..24b38b73 100644 --- a/workflows/cellranger-atac-count.cwl +++ b/workflows/cellranger-atac-count.cwl @@ -36,6 +36,10 @@ inputs: "sd:upstreamSource": "genome_indices/arc_indices_folder" "sd:localLabel": true + annotation_gtf_file: + type: File + "sd:upstreamSource": "genome_indices/genome_indices/annotation_gtf" + memory_limit: type: int? 
default: 20 @@ -507,6 +511,7 @@ steps: in: secondary_analysis_report_folder: generate_counts_matrix/secondary_analysis_report_folder filtered_feature_bc_matrix_folder: generate_counts_matrix/filtered_feature_bc_matrix_folder + annotation_gtf_file: annotation_gtf_file out: - html_data - index_html_file diff --git a/workflows/cellranger-mkvdjref.cwl b/workflows/cellranger-mkvdjref.cwl index 439e2618..b69ad4d5 100644 --- a/workflows/cellranger-mkvdjref.cwl +++ b/workflows/cellranger-mkvdjref.cwl @@ -33,6 +33,36 @@ inputs: For example: https://ftp.ensembl.org/pub/current_gtf/homo_sapiens/Homo_sapiens.GRCh38.108.gtf.gz + memory_limit: + type: int? + default: 20 + label: "Maximum memory used (GB)" + doc: | + Maximum memory used (GB). + "sd:layout": + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + - "2" + - "3" + - "4" + - "5" + - "6" + default: "4" + label: "Cores/CPUs" + doc: | + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. 
+ Default: 4 + "sd:layout": + advanced: true + outputs: @@ -84,6 +114,10 @@ steps: in: genome_fasta_file: extract_fasta/extracted_file annotation_gtf_file: extract_gtf/extracted_file + threads: + source: threads + valueFrom: $(parseInt(self)) + memory_limit: memory_limit output_folder_name: default: "cellranger_vdj_ref" out: diff --git a/workflows/cellranger-multi.cwl b/workflows/cellranger-multi.cwl index 35de8b4d..ff211982 100644 --- a/workflows/cellranger-multi.cwl +++ b/workflows/cellranger-multi.cwl @@ -24,7 +24,7 @@ inputs: sd:preview: position: 1 - gex_indices_folder: + rna_indices_folder: type: Directory label: "Cell Ranger Reference Sample" doc: | @@ -56,23 +56,23 @@ inputs: "sd:upstreamSource": "vdj_indices/indices_folder" "sd:localLabel": true - gex_fastq_file_r1: + rna_fastq_file_r1: type: - File - type: array items: File format: "http://edamontology.org/format_1930" - label: "GEX FASTQ file(s) R1 (optionally compressed)" - doc: "GEX FASTQ file(s) R1 (optionally compressed)" + label: "RNA FASTQ file(s) R1 (optionally compressed)" + doc: "RNA FASTQ file(s) R1 (optionally compressed)" - gex_fastq_file_r2: + rna_fastq_file_r2: type: - File - type: array items: File format: "http://edamontology.org/format_1930" - label: "GEX FASTQ file(s) R2 (optionally compressed)" - doc: "GEX FASTQ file(s) R2 (optionally compressed)" + label: "RNA FASTQ file(s) R2 (optionally compressed)" + doc: "RNA FASTQ file(s) R2 (optionally compressed)" vdj_fastq_file_r1: type: @@ -123,7 +123,7 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the @@ -136,23 +136,23 @@ inputs: outputs: - fastqc_report_gex_fastq_r1: + fastqc_report_rna_fastq_r1: type: File - outputSource: run_fastqc_for_gex_fastq_r1/html_file - label: "FastqQC report for GEX FASTQ file R1" + outputSource: run_fastqc_for_rna_fastq_r1/html_file + label: "FastqQC report for RNA FASTQ file R1" doc: | - FastqQC report for GEX FASTQ file R1 + FastqQC 
report for RNA FASTQ file R1 "sd:visualPlugins": - linkList: tab: "Overview" target: "_blank" - fastqc_report_gex_fastq_r2: + fastqc_report_rna_fastq_r2: type: File - outputSource: run_fastqc_for_gex_fastq_r2/html_file - label: "FastqQC report for GEX FASTQ file R2" + outputSource: run_fastqc_for_rna_fastq_r2/html_file + label: "FastqQC report for RNA FASTQ file R2" doc: | - FastqQC report for GEX FASTQ file R2 + FastqQC report for RNA FASTQ file R2 "sd:visualPlugins": - linkList: tab: "Overview" @@ -207,7 +207,7 @@ outputs: outputSource: cellranger_multi/possorted_genome_bam_bai label: "Unaligned and aligned to the genome and transcriptome indexed reads" doc: | - Indexed GEX BAM file containing position-sorted reads aligned + Indexed RNA BAM file containing position-sorted reads aligned to the genome and transcriptome, as well as unaligned reads. filtered_feature_bc_matrix_folder: @@ -250,9 +250,9 @@ outputs: secondary_analysis_report_folder: type: File outputSource: compress_secondary_analysis_report_folder/compressed_folder - label: "Folder with secondary analysis of GEX data" + label: "Folder with secondary analysis of RNA data" doc: | - Folder with secondary analysis of GEX data including dimensionality + Folder with secondary analysis of RNA data including dimensionality reduction, cell clustering, and differential expression loupe_browser_track: @@ -274,7 +274,7 @@ outputs: suitable as an archive of every single input read. This file includes reads from all cells barcodes identified by V(D)J algorithm including those ones that will be later discarded as non-viable cells by V(D)J algorithm and those - barcodes that will be later removed after overlapping with cells called by GEX algorithm. + barcodes that will be later removed after overlapping with cells called by RNA algorithm. all_contig_sequences_fasta: type: File @@ -284,7 +284,7 @@ outputs: FASTA format sequence for ALL assembled contigs in the V(D)J library. 
This file includes both productive and non-productive contigs with high and low confidence assembled for all identified cells barcodes including those ones that will be later discarded - as non-viable cells by V(D)J algorithm or after overlapping with cells called by GEX algorithm. + as non-viable cells by V(D)J algorithm or after overlapping with cells called by RNA algorithm. all_contig_annotations_bed: type: File @@ -296,7 +296,7 @@ outputs: out. This file includes both productive and non-productive contigs with high and low confidence assembled for all identified cells barcodes including those ones that will be later discarded as non-viable cells by V(D)J algorithm or after overlapping with cells - called by GEX algorithm. + called by RNA algorithm. all_contig_annotations_csv: type: File @@ -308,7 +308,7 @@ outputs: out. This file includes both productive and non-productive contigs with high and low confidence assembled for all identified cells barcodes including those ones that will be later discarded as non-viable cells by V(D)J algorithm or after overlapping with cells - called by GEX algorithm. + called by RNA algorithm. airr_rearrangement_tsv: type: File @@ -317,7 +317,7 @@ outputs: doc: | Annotated contigs and consensus sequences of V(D)J rearrangements in the AIRR format. It includes only viable cells identified by - both V(D)J and GEX algorithms. + both V(D)J and RNA algorithms. clonotypes_tsv: type: File @@ -326,7 +326,7 @@ outputs: doc: | TSV file with high-level descriptions of each clonotype. During the clonotype grouping stage, cell barcodes are placed in groups called clonotypes. Only viable - cells identified by both V(D)J and GEX algorithms are used. Each clonotype consists + cells identified by both V(D)J and RNA algorithms are used. Each clonotype consists of all descendants of a single, fully rearranged common ancestor, as approximated computationally. 
During this process, some cell barcodes are flagged as likely artifacts and filtered out, meaning that they are no longer called as cells. @@ -450,21 +450,21 @@ outputs: steps: - extract_gex_fastq_r1: + extract_rna_fastq_r1: run: ../tools/extract-fastq.cwl in: - compressed_file: gex_fastq_file_r1 + compressed_file: rna_fastq_file_r1 output_prefix: - default: "gex_read_1" + default: "rna_read_1" out: - fastq_file - extract_gex_fastq_r2: + extract_rna_fastq_r2: run: ../tools/extract-fastq.cwl in: - compressed_file: gex_fastq_file_r2 + compressed_file: rna_fastq_file_r2 output_prefix: - default: "gex_read_2" + default: "rna_read_2" out: - fastq_file @@ -486,20 +486,20 @@ steps: out: - fastq_file - run_fastqc_for_gex_fastq_r1: + run_fastqc_for_rna_fastq_r1: run: ../tools/fastqc.cwl in: - reads_file: extract_gex_fastq_r1/fastq_file + reads_file: extract_rna_fastq_r1/fastq_file threads: source: threads valueFrom: $(parseInt(self)) out: - html_file - run_fastqc_for_gex_fastq_r2: + run_fastqc_for_rna_fastq_r2: run: ../tools/fastqc.cwl in: - reads_file: extract_gex_fastq_r2/fastq_file + reads_file: extract_rna_fastq_r2/fastq_file threads: source: threads valueFrom: $(parseInt(self)) @@ -529,11 +529,11 @@ steps: cellranger_multi: run: ../tools/cellranger-multi.cwl in: - gex_fastq_file_r1: extract_gex_fastq_r1/fastq_file - gex_fastq_file_r2: extract_gex_fastq_r2/fastq_file + rna_fastq_file_r1: extract_rna_fastq_r1/fastq_file + rna_fastq_file_r2: extract_rna_fastq_r2/fastq_file vdj_fastq_file_r1: extract_vdj_fastq_r1/fastq_file vdj_fastq_file_r2: extract_vdj_fastq_r2/fastq_file - gex_indices_folder: gex_indices_folder + rna_indices_folder: rna_indices_folder vdj_indices_folder: vdj_indices_folder vdj_chain_type: vdj_chain_type threads: From 45866e63c3f354257eab14e19fa5e614ad315f7e Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 12 Jul 2024 16:43:30 -0400 Subject: [PATCH 149/162] Refactor sc format transform workflow to not fail when run with Toil --- 
workflows/sc-format-transform.cwl | 61 ++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/workflows/sc-format-transform.cwl b/workflows/sc-format-transform.cwl index 9783c1a1..e2fe97ef 100644 --- a/workflows/sc-format-transform.cwl +++ b/workflows/sc-format-transform.cwl @@ -19,17 +19,18 @@ inputs: compressed_sparse_matrix: type: File - label: "Compressed folder with feature-barcode matrix in MEX format" + label: "TAR-gzipped folder with the feature-barcode matrix in MEX format" doc: | - Compressed folder with feature-barcode matrix from - Cell Ranger Count/Aggregate experiment in MEX format + Compressed folder with the feature-barcode + matrix from the Cell Ranger Count/Aggregate + experiment in MEX format (TAR-gzipped). metadata: type: File? - label: "Aggregation metadata in CSV format" + label: "Aggregation metadata in TSV format" doc: | - Aggregation metadata file from Cell Ranger - Aggregate experiment + Aggregation metadata file from the + Cell Ranger Aggregate experiment outputs: @@ -37,18 +38,19 @@ outputs: filtered_feature_bc_matrix_folder: type: File outputSource: pipe/filtered_feature_bc_matrix_folder - label: "Compressed folder with feature-barcode matrix in MEX format" + label: "TAR-gzipped folder with the feature-barcode matrix in MEX format" doc: | - Compressed folder with feature-barcode matrix from - Cell Ranger Count/Aggregate experiment in MEX format + Compressed folder with the feature-barcode + matrix from the Cell Ranger Count/Aggregate + experiment in MEX format (TAR-gzipped). aggregation_metadata: type: File? 
outputSource: pipe/aggregation_metadata - label: "Aggregation metadata in CSV format" + label: "Aggregation metadata in TSV format" doc: | - Aggregation metadata file from Cell Ranger - Aggregate experiment + Aggregation metadata file from the + Cell Ranger Aggregate experiment steps: @@ -56,24 +58,40 @@ steps: pipe: run: cwlVersion: v1.0 - class: ExpressionTool + class: CommandLineTool + hints: + - class: DockerRequirement + dockerPull: biowardrobe2/scidap:v0.0.3 inputs: + script: + type: string? + default: | + #!/bin/bash + RNDM_PREFIX=$(tr -dc a-z ./${RNDM_PREFIX}_aggr.tsv + fi + inputBinding: + position: 1 compressed_sparse_matrix: type: File + inputBinding: + position: 2 metadata: type: File? + inputBinding: + position: 3 outputs: filtered_feature_bc_matrix_folder: type: File + outputBinding: + glob: "*_bc_matrix.tar.gz" aggregation_metadata: type: File? - expression: | - ${ - return { - "filtered_feature_bc_matrix_folder": inputs.compressed_sparse_matrix, - "aggregation_metadata": inputs.metadata - }; - } + outputBinding: + glob: "*_aggr.tsv" + baseCommand: [bash, '-c'] in: compressed_sparse_matrix: compressed_sparse_matrix metadata: metadata @@ -130,4 +148,5 @@ s:creator: doc: | Single-cell Format Transform - Transforms single-cell sequencing data formats into Cell Ranger like output + Transforms single-cell sequencing data formats + into Cell Ranger like output. 
From 0d543199577beca3a685edd73ae175c44f076f1f Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 25 Jul 2024 14:49:17 -0400 Subject: [PATCH 150/162] Prevent DiffBind from failing when generating BigBed track --- tools/custom-bedops.cwl | 92 +++++++++++++++++++++++++++++ workflows/diffbind-multi-factor.cwl | 19 +++++- workflows/diffbind.cwl | 18 +++++- 3 files changed, 125 insertions(+), 4 deletions(-) create mode 100644 tools/custom-bedops.cwl diff --git a/tools/custom-bedops.cwl b/tools/custom-bedops.cwl new file mode 100644 index 00000000..3a94db05 --- /dev/null +++ b/tools/custom-bedops.cwl @@ -0,0 +1,92 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/bedops:v2.4.34 + + +inputs: + + script: + type: string? + default: | + cat "$0" > `basename $0` + inputBinding: + position: 1 + + input_file: + type: + - File + - File[] + inputBinding: + position: 2 + + param: + type: + - string? + - string[] + inputBinding: + position: 3 + + +outputs: + + output_file: + type: File + outputBinding: + glob: "*" + + +baseCommand: [bash, '-c'] + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +s:name: "custom-bedops" +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/custom-bedops.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: 
"https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:michael.kotliar@cchmc.org + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + +doc: | + Tool to run custom script set as `script` + input with arguments from `param`. Based + on bedops Dockerfile. + +s:about: | + Custom bash script runner \ No newline at end of file diff --git a/workflows/diffbind-multi-factor.cwl b/workflows/diffbind-multi-factor.cwl index 5716b69d..e76141ec 100644 --- a/workflows/diffbind-multi-factor.cwl +++ b/workflows/diffbind-multi-factor.cwl @@ -887,15 +887,30 @@ steps: out: - sorted_file + overlap_with_chr_length: + run: ../tools/custom-bedops.cwl + in: + input_file: + - chrom_length_file + - sort_bed/sorted_file + script: + default: | + cat "$0" | awk '{print $1"\t0\t"$2}' | sort-bed - > temp_chrom_length.bed + cat "$1" | awk '$2 >= 0' > temp_sorted.bed + bedops --element-of 100% temp_sorted.bed temp_chrom_length.bed > `basename $1` + rm -f temp_chrom_length.bed temp_sorted.bed + out: + - output_file + bed_to_bigbed: run: ../tools/ucsc-bedtobigbed.cwl in: - input_bed: sort_bed/sorted_file + input_bed: overlap_with_chr_length/output_file bed_type: default: "bed4+5" chrom_length_file: chrom_length_file output_filename: - source: sort_bed/sorted_file + source: overlap_with_chr_length/output_file valueFrom: $(self.basename.split('.').slice(0,-1).join('.') + ".bigBed") out: - bigbed_file diff --git a/workflows/diffbind.cwl b/workflows/diffbind.cwl index ded5bbcd..6e046168 100644 --- a/workflows/diffbind.cwl +++ b/workflows/diffbind.cwl @@ -1038,15 +1038,29 @@ steps: default: ["1,1","2,2n"] out: [sorted_file] + overlap_with_chr_length: + run: ../tools/custom-bedops.cwl + in: + input_file: + - 
chrom_length_file + - sort_bed/sorted_file + script: + default: | + cat "$0" | awk '{print $1"\t0\t"$2}' | sort-bed - > temp_chrom_length.bed + cat "$1" | awk '$2 >= 0' > temp_sorted.bed + bedops --element-of 100% temp_sorted.bed temp_chrom_length.bed > `basename $1` + rm -f temp_chrom_length.bed temp_sorted.bed + out: [output_file] + bed_to_bigbed: run: ../tools/ucsc-bedtobigbed.cwl in: - input_bed: sort_bed/sorted_file + input_bed: overlap_with_chr_length/output_file bed_type: default: "bed4+5" chrom_length_file: chrom_length_file output_filename: - source: sort_bed/sorted_file + source: overlap_with_chr_length/output_file valueFrom: $(self.basename.split('.').slice(0,-1).join('.') + ".bigBed") out: [bigbed_file] From 1118ec16ffbe05705fdbb94090262636432c3626 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 29 Jul 2024 12:06:38 -0400 Subject: [PATCH 151/162] Export HTML log from all sc workflows --- tools/cellbrowser-build-altanalyze.cwl | 2 +- tools/cellbrowser-build-cellranger-arc.cwl | 2 +- tools/cellbrowser-build-cellranger-atac.cwl | 2 +- tools/cellbrowser-build-cellranger.cwl | 2 +- tools/sc-atac-cluster.cwl | 23 ++++++++++++++++-- tools/sc-atac-coverage.cwl | 23 ++++++++++++++++-- tools/sc-atac-dbinding.cwl | 23 ++++++++++++++++-- tools/sc-atac-filter.cwl | 23 ++++++++++++++++-- tools/sc-atac-reduce.cwl | 23 ++++++++++++++++-- tools/sc-ctype-assign.cwl | 23 ++++++++++++++++-- tools/sc-multiome-filter.cwl | 22 +++++++++++++++-- tools/sc-rna-cluster.cwl | 22 +++++++++++++++-- tools/sc-rna-da-cells.cwl | 22 +++++++++++++++-- tools/sc-rna-de-pseudobulk.cwl | 22 +++++++++++++++-- tools/sc-rna-filter.cwl | 22 +++++++++++++++-- tools/sc-rna-reduce.cwl | 23 ++++++++++++++++-- tools/sc-rna-trajectory.cwl | 22 +++++++++++++++-- tools/sc-triangulate.cwl | 23 ++++++++++++++++-- tools/sc-vdj-profile.cwl | 23 ++++++++++++++++-- tools/sc-wnn-cluster.cwl | 23 ++++++++++++++++-- workflows/sc-atac-cluster.cwl | 24 +++++++++++++++++++ workflows/sc-atac-coverage.cwl | 
24 +++++++++++++++++++ workflows/sc-atac-dbinding.cwl | 24 +++++++++++++++++++ workflows/sc-atac-filter.cwl | 24 +++++++++++++++++++ workflows/sc-atac-reduce.cwl | 24 +++++++++++++++++++ workflows/sc-ctype-assign.cwl | 24 +++++++++++++++++++ workflows/sc-multiome-filter.cwl | 24 +++++++++++++++++++ workflows/sc-rna-cluster.cwl | 24 +++++++++++++++++++ workflows/sc-rna-da-cells.cwl | 24 +++++++++++++++++++ workflows/sc-rna-de-pseudobulk.cwl | 24 +++++++++++++++++++ workflows/sc-rna-filter.cwl | 24 +++++++++++++++++++ workflows/sc-rna-reduce.cwl | 24 +++++++++++++++++++ workflows/sc-rna-trajectory.cwl | 24 +++++++++++++++++++ workflows/sc-triangulate.cwl | 24 +++++++++++++++++++ workflows/sc-vdj-profile.cwl | 26 ++++++++++++++++++++- workflows/sc-wnn-cluster.cwl | 24 +++++++++++++++++++ 36 files changed, 719 insertions(+), 37 deletions(-) diff --git a/tools/cellbrowser-build-altanalyze.cwl b/tools/cellbrowser-build-altanalyze.cwl index ec2023b1..f92d46c6 100644 --- a/tools/cellbrowser-build-altanalyze.cwl +++ b/tools/cellbrowser-build-altanalyze.cwl @@ -4,7 +4,7 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 requirements: diff --git a/tools/cellbrowser-build-cellranger-arc.cwl b/tools/cellbrowser-build-cellranger-arc.cwl index ace53c76..e0f7f60f 100644 --- a/tools/cellbrowser-build-cellranger-arc.cwl +++ b/tools/cellbrowser-build-cellranger-arc.cwl @@ -4,7 +4,7 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 requirements: diff --git a/tools/cellbrowser-build-cellranger-atac.cwl b/tools/cellbrowser-build-cellranger-atac.cwl index 9f534536..192e4b36 100644 --- a/tools/cellbrowser-build-cellranger-atac.cwl +++ b/tools/cellbrowser-build-cellranger-atac.cwl @@ -4,7 +4,7 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + 
dockerPull: biowardrobe2/sc-tools:v0.0.40 requirements: diff --git a/tools/cellbrowser-build-cellranger.cwl b/tools/cellbrowser-build-cellranger.cwl index 9e23a627..0abc4e93 100644 --- a/tools/cellbrowser-build-cellranger.cwl +++ b/tools/cellbrowser-build-cellranger.cwl @@ -4,7 +4,7 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 requirements: diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index 75e0bf4b..3321195a 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 inputs: @@ -230,6 +230,14 @@ inputs: doc: | Export results to UCSC Cell Browser. Default: false + export_html_report: + type: boolean? + default: false + doc: | + Export tehcnical report. HTML format. + Note, stdout will be less informative. + Default: false + output_prefix: type: string? inputBinding: @@ -506,6 +514,14 @@ outputs: Seurat object. H5AD format. + sc_report_html_file: + type: File? + outputBinding: + glob: "sc_report.html" + doc: | + Tehcnical report. + HTML format. + stdout_log: type: stdout @@ -513,7 +529,10 @@ outputs: type: stderr -baseCommand: ["sc_atac_cluster.R"] +baseCommand: ["Rscript"] +arguments: +- valueFrom: $(inputs.export_html_report?["/usr/local/bin/sc_report_wrapper.R", "/usr/local/bin/sc_atac_cluster.R"]:"/usr/local/bin/sc_atac_cluster.R") + stdout: sc_atac_cluster_stdout.log stderr: sc_atac_cluster_stderr.log diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index 245d6dee..1fb26969 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 inputs: @@ -88,6 +88,14 @@ inputs: Print debug information. 
Default: false + export_html_report: + type: boolean? + default: false + doc: | + Export tehcnical report. HTML format. + Note, stdout will be less informative. + Default: false + output_prefix: type: string? inputBinding: @@ -161,6 +169,14 @@ outputs: Genome coverage calculated for ATAC fragments in bigWig format. + sc_report_html_file: + type: File? + outputBinding: + glob: "sc_report.html" + doc: | + Tehcnical report. + HTML format. + stdout_log: type: stdout @@ -168,7 +184,10 @@ outputs: type: stderr -baseCommand: ["sc_atac_coverage.R"] +baseCommand: ["Rscript"] +arguments: +- valueFrom: $(inputs.export_html_report?["/usr/local/bin/sc_report_wrapper.R", "/usr/local/bin/sc_atac_coverage.R"]:"/usr/local/bin/sc_atac_coverage.R") + stdout: sc_atac_coverage_stdout.log stderr: sc_atac_coverage_stderr.log diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index 63ba725b..41c03c23 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 inputs: @@ -259,6 +259,14 @@ inputs: Print debug information. Default: false + export_html_report: + type: boolean? + default: false + doc: | + Export tehcnical report. HTML format. + Note, stdout will be less informative. + Default: false + output_prefix: type: string? inputBinding: @@ -535,6 +543,14 @@ outputs: in the group of cells defined by the --second and --groupby parameters. + sc_report_html_file: + type: File? + outputBinding: + glob: "sc_report.html" + doc: | + Tehcnical report. + HTML format. 
+ stdout_log: type: stdout @@ -542,7 +558,10 @@ outputs: type: stderr -baseCommand: ["sc_atac_dbinding.R"] +baseCommand: ["Rscript"] +arguments: +- valueFrom: $(inputs.export_html_report?["/usr/local/bin/sc_report_wrapper.R", "/usr/local/bin/sc_atac_dbinding.R"]:"/usr/local/bin/sc_atac_dbinding.R") + stdout: sc_atac_dbinding_stdout.log stderr: sc_atac_dbinding_stderr.log diff --git a/tools/sc-atac-filter.cwl b/tools/sc-atac-filter.cwl index a027da83..9144ffc8 100644 --- a/tools/sc-atac-filter.cwl +++ b/tools/sc-atac-filter.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 inputs: @@ -298,6 +298,14 @@ inputs: doc: | Export results to UCSC Cell Browser. Default: false + export_html_report: + type: boolean? + default: false + doc: | + Export tehcnical report. HTML format. + Note, stdout will be less informative. + Default: false + output_prefix: type: string? inputBinding: @@ -811,6 +819,14 @@ outputs: Seurat object. H5AD format + sc_report_html_file: + type: File? + outputBinding: + glob: "sc_report.html" + doc: | + Tehcnical report. + HTML format. + stdout_log: type: stdout @@ -818,7 +834,10 @@ outputs: type: stderr -baseCommand: ["sc_atac_filter.R"] +baseCommand: ["Rscript"] +arguments: +- valueFrom: $(inputs.export_html_report?["/usr/local/bin/sc_report_wrapper.R", "/usr/local/bin/sc_atac_filter.R"]:"/usr/local/bin/sc_atac_filter.R") + stdout: sc_atac_filter_stdout.log stderr: sc_atac_filter_stderr.log diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index 408a93b0..7782ac31 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 inputs: @@ -261,6 +261,14 @@ inputs: doc: | Export results to UCSC Cell Browser. Default: false + export_html_report: + type: boolean? 
+ default: false + doc: | + Export tehcnical report. HTML format. + Note, stdout will be less informative. + Default: false + output_prefix: type: string? inputBinding: @@ -647,6 +655,14 @@ outputs: Seurat object. H5AD format. + sc_report_html_file: + type: File? + outputBinding: + glob: "sc_report.html" + doc: | + Tehcnical report. + HTML format. + stdout_log: type: stdout @@ -654,7 +670,10 @@ outputs: type: stderr -baseCommand: ["sc_atac_reduce.R"] +baseCommand: ["Rscript"] +arguments: +- valueFrom: $(inputs.export_html_report?["/usr/local/bin/sc_report_wrapper.R", "/usr/local/bin/sc_atac_reduce.R"]:"/usr/local/bin/sc_atac_reduce.R") + stdout: sc_atac_reduce_stdout.log stderr: sc_atac_reduce_stderr.log diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index bf28ca84..13354272 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 inputs: @@ -308,6 +308,14 @@ inputs: doc: | Export results to UCSC Cell Browser. Default: false + export_html_report: + type: boolean? + default: false + doc: | + Export tehcnical report. HTML format. + Note, stdout will be less informative. + Default: false + output_prefix: type: string? inputBinding: @@ -741,6 +749,14 @@ outputs: SCope compatible. Loom format. + sc_report_html_file: + type: File? + outputBinding: + glob: "sc_report.html" + doc: | + Tehcnical report. + HTML format. 
+ stdout_log: type: stdout @@ -748,7 +764,10 @@ outputs: type: stderr -baseCommand: ["sc_ctype_assign.R"] +baseCommand: ["Rscript"] +arguments: +- valueFrom: $(inputs.export_html_report?["/usr/local/bin/sc_report_wrapper.R", "/usr/local/bin/sc_ctype_assign.R"]:"/usr/local/bin/sc_ctype_assign.R") + stdout: sc_ctype_assign_stdout.log stderr: sc_ctype_assign_stderr.log diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index f3dd4f74..4ddfedd8 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 inputs: @@ -418,6 +418,14 @@ inputs: doc: | Export results to UCSC Cell Browser. Default: false + export_html_report: + type: boolean? + default: false + doc: | + Export tehcnical report. HTML format. + Note, stdout will be less informative. + Default: false + output_prefix: type: string? inputBinding: @@ -1317,6 +1325,14 @@ outputs: RNA counts. Loupe format + sc_report_html_file: + type: File? + outputBinding: + glob: "sc_report.html" + doc: | + Tehcnical report. + HTML format. + stdout_log: type: stdout @@ -1324,7 +1340,9 @@ outputs: type: stderr -baseCommand: ["sc_multiome_filter.R"] +baseCommand: ["Rscript"] +arguments: +- valueFrom: $(inputs.export_html_report?["/usr/local/bin/sc_report_wrapper.R", "/usr/local/bin/sc_multiome_filter.R"]:"/usr/local/bin/sc_multiome_filter.R") stdout: sc_multiome_filter_stdout.log stderr: sc_multiome_filter_stderr.log diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index 8c01db74..3de79a12 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 inputs: @@ -227,6 +227,14 @@ inputs: doc: | Export results to UCSC Cell Browser. 
Default: false + export_html_report: + type: boolean? + default: false + doc: | + Export tehcnical report. HTML format. + Note, stdout will be less informative. + Default: false + output_prefix: type: string? inputBinding: @@ -663,6 +671,14 @@ outputs: SCope compatible. Loom format. + sc_report_html_file: + type: File? + outputBinding: + glob: "sc_report.html" + doc: | + Tehcnical report. + HTML format. + stdout_log: type: stdout @@ -670,7 +686,9 @@ outputs: type: stderr -baseCommand: ["sc_rna_cluster.R"] +baseCommand: ["Rscript"] +arguments: +- valueFrom: $(inputs.export_html_report?["/usr/local/bin/sc_report_wrapper.R", "/usr/local/bin/sc_rna_cluster.R"]:"/usr/local/bin/sc_rna_cluster.R") stdout: sc_rna_cluster_stdout.log stderr: sc_rna_cluster_stderr.log diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index 0ee481c1..fc2b5c0d 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 inputs: @@ -183,6 +183,14 @@ inputs: doc: | Export results to UCSC Cell Browser. Default: false + export_html_report: + type: boolean? + default: false + doc: | + Export tehcnical report. HTML format. + Note, stdout will be less informative. + Default: false + output_prefix: type: string? inputBinding: @@ -489,6 +497,14 @@ outputs: Seurat object. Loupe format + sc_report_html_file: + type: File? + outputBinding: + glob: "sc_report.html" + doc: | + Tehcnical report. + HTML format. 
+ stdout_log: type: stdout @@ -496,7 +512,9 @@ outputs: type: stderr -baseCommand: ["sc_rna_da_cells.R"] +baseCommand: ["Rscript"] +arguments: +- valueFrom: $(inputs.export_html_report?["/usr/local/bin/sc_report_wrapper.R", "/usr/local/bin/sc_rna_da_cells.R"]:"/usr/local/bin/sc_rna_da_cells.R") stdout: sc_rna_da_cells_stdout.log stderr: sc_rna_da_cells_stderr.log diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index c04283f1..580b3d6a 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 inputs: @@ -299,6 +299,14 @@ inputs: Print debug information. Default: false + export_html_report: + type: boolean? + default: false + doc: | + Export tehcnical report. HTML format. + Note, stdout will be less informative. + Default: false + output_prefix: type: string? inputBinding: @@ -646,6 +654,14 @@ outputs: Filtered normalized reads counts per cell. GCT format. + sc_report_html_file: + type: File? + outputBinding: + glob: "sc_report.html" + doc: | + Tehcnical report. + HTML format. + stdout_log: type: stdout @@ -653,7 +669,9 @@ outputs: type: stderr -baseCommand: ["sc_rna_de_pseudobulk.R"] +baseCommand: ["Rscript"] +arguments: +- valueFrom: $(inputs.export_html_report?["/usr/local/bin/sc_report_wrapper.R", "/usr/local/bin/sc_rna_de_pseudobulk.R"]:"/usr/local/bin/sc_rna_de_pseudobulk.R") stdout: sc_rna_de_pseudobulk_stdout.log stderr: sc_rna_de_pseudobulk_stderr.log diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index 31f7c28d..3c083764 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 inputs: @@ -241,6 +241,14 @@ inputs: doc: | Export results to UCSC Cell Browser. 
Default: false + export_html_report: + type: boolean? + default: false + doc: | + Export tehcnical report. HTML format. + Note, stdout will be less informative. + Default: false + output_prefix: type: string? inputBinding: @@ -632,6 +640,14 @@ outputs: Seurat object. Loupe format + sc_report_html_file: + type: File? + outputBinding: + glob: "sc_report.html" + doc: | + Tehcnical report. + HTML format. + stdout_log: type: stdout @@ -639,7 +655,9 @@ outputs: type: stderr -baseCommand: ["sc_rna_filter.R"] +baseCommand: ["Rscript"] +arguments: +- valueFrom: $(inputs.export_html_report?["/usr/local/bin/sc_report_wrapper.R", "/usr/local/bin/sc_rna_filter.R"]:"/usr/local/bin/sc_rna_filter.R") stdout: sc_rna_filter_stdout.log stderr: sc_rna_filter_stderr.log diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index 913ac887..142461ef 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 inputs: @@ -353,6 +353,14 @@ inputs: 'sct' or 'sctglm'. Default: false + export_html_report: + type: boolean? + default: false + doc: | + Export tehcnical report. HTML format. + Note, stdout will be less informative. + Default: false + output_prefix: type: string? inputBinding: @@ -752,6 +760,14 @@ outputs: SCope compatible. Loom format + sc_report_html_file: + type: File? + outputBinding: + glob: "sc_report.html" + doc: | + Tehcnical report. + HTML format. 
+ stdout_log: type: stdout @@ -759,7 +775,10 @@ outputs: type: stderr -baseCommand: ["sc_rna_reduce.R"] +baseCommand: ["Rscript"] +arguments: +- valueFrom: $(inputs.export_html_report?["/usr/local/bin/sc_report_wrapper.R", "/usr/local/bin/sc_rna_reduce.R"]:"/usr/local/bin/sc_rna_reduce.R") + stdout: sc_rna_reduce_stdout.log stderr: sc_rna_reduce_stderr.log diff --git a/tools/sc-rna-trajectory.cwl b/tools/sc-rna-trajectory.cwl index be8df3d6..dd9b6477 100644 --- a/tools/sc-rna-trajectory.cwl +++ b/tools/sc-rna-trajectory.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 inputs: @@ -157,6 +157,14 @@ inputs: doc: | Export results to UCSC Cell Browser. Default: false + export_html_report: + type: boolean? + default: false + doc: | + Export tehcnical report. HTML format. + Note, stdout will be less informative. + Default: false + output_prefix: type: string? inputBinding: @@ -625,6 +633,14 @@ outputs: Seurat object. Loupe format + sc_report_html_file: + type: File? + outputBinding: + glob: "sc_report.html" + doc: | + Tehcnical report. + HTML format. + stdout_log: type: stdout @@ -632,7 +648,9 @@ outputs: type: stderr -baseCommand: ["sc_rna_trajectory.R"] +baseCommand: ["Rscript"] +arguments: +- valueFrom: $(inputs.export_html_report?["/usr/local/bin/sc_report_wrapper.R", "/usr/local/bin/sc_rna_trajectory.R"]:"/usr/local/bin/sc_rna_trajectory.R") stdout: sc_rna_trajectory_stdout.log stderr: sc_rna_trajectory_stderr.log diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index e08ab3a8..fea8b1ac 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 inputs: @@ -123,6 +123,14 @@ inputs: doc: | Export results to UCSC Cell Browser. Default: false + export_html_report: + type: boolean? 
+ default: false + doc: | + Export tehcnical report. HTML format. + Note, stdout will be less informative. + Default: false + output_prefix: type: string? inputBinding: @@ -374,6 +382,14 @@ outputs: RNA counts. Loupe format + sc_report_html_file: + type: File? + outputBinding: + glob: "sc_report.html" + doc: | + Tehcnical report. + HTML format. + stdout_log: type: stdout @@ -381,7 +397,10 @@ outputs: type: stderr -baseCommand: ["sc_triangulate.R"] +baseCommand: ["Rscript"] +arguments: +- valueFrom: $(inputs.export_html_report?["/usr/local/bin/sc_report_wrapper.R", "/usr/local/bin/sc_triangulate.R"]:"/usr/local/bin/sc_triangulate.R") + stdout: sc_triangulate_stdout.log stderr: sc_triangulate_stderr.log diff --git a/tools/sc-vdj-profile.cwl b/tools/sc-vdj-profile.cwl index 7dbdf1b7..1308a99b 100644 --- a/tools/sc-vdj-profile.cwl +++ b/tools/sc-vdj-profile.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 inputs: @@ -196,6 +196,14 @@ inputs: doc: | Export results to UCSC Cell Browser. Default: false + export_html_report: + type: boolean? + default: false + doc: | + Export tehcnical report. HTML format. + Note, stdout will be less informative. + Default: false + output_prefix: type: string? inputBinding: @@ -543,6 +551,14 @@ outputs: SCope compatible. Loom format. + sc_report_html_file: + type: File? + outputBinding: + glob: "sc_report.html" + doc: | + Tehcnical report. + HTML format. 
+ stdout_log: type: stdout @@ -550,7 +566,10 @@ outputs: type: stderr -baseCommand: ["sc_vdj_profile.R"] +baseCommand: ["Rscript"] +arguments: +- valueFrom: $(inputs.export_html_report?["/usr/local/bin/sc_report_wrapper.R", "/usr/local/bin/sc_vdj_profile.R"]:"/usr/local/bin/sc_vdj_profile.R") + stdout: sc_vdj_profile_stdout.log stderr: sc_vdj_profile_stderr.log diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index 9562059d..d363ed40 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.39 + dockerPull: biowardrobe2/sc-tools:v0.0.40 inputs: @@ -382,6 +382,14 @@ inputs: doc: | Export results to UCSC Cell Browser. Default: false + export_html_report: + type: boolean? + default: false + doc: | + Export tehcnical report. HTML format. + Note, stdout will be less informative. + Default: false + output_prefix: type: string? inputBinding: @@ -890,6 +898,14 @@ outputs: SCope compatible. Loom format + sc_report_html_file: + type: File? + outputBinding: + glob: "sc_report.html" + doc: | + Tehcnical report. + HTML format. + stdout_log: type: stdout @@ -897,7 +913,10 @@ outputs: type: stderr -baseCommand: ["sc_wnn_cluster.R"] +baseCommand: ["Rscript"] +arguments: +- valueFrom: $(inputs.export_html_report?["/usr/local/bin/sc_report_wrapper.R", "/usr/local/bin/sc_wnn_cluster.R"]:"/usr/local/bin/sc_wnn_cluster.R") + stdout: sc_wnn_cluster_stdout.log stderr: sc_wnn_cluster_stderr.log diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index 4ea595dd..f83e1776 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -128,6 +128,16 @@ inputs: not provided. Default: None + export_html_report: + type: boolean? + default: true + label: "Show HTML report" + doc: | + Export tehcnical report in HTML format. 
+ Default: true + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -404,6 +414,18 @@ outputs: doc: | Compressed folder with all PDF plots. + sc_report_html_file: + type: File? + outputSource: sc_atac_cluster/sc_report_html_file + label: "Analysis log" + doc: | + Tehcnical report. + HTML format. + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + sc_atac_cluster_stdout_log: type: File outputSource: sc_atac_cluster/stdout_log @@ -456,6 +478,7 @@ steps: default: 32 vector_memory_limit: default: 128 + export_html_report: export_html_report threads: source: threads valueFrom: $(parseInt(self)) @@ -476,6 +499,7 @@ steps: - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds + - sc_report_html_file - stdout_log - stderr_log diff --git a/workflows/sc-atac-coverage.cwl b/workflows/sc-atac-coverage.cwl index a2370049..ec3b70f3 100644 --- a/workflows/sc-atac-coverage.cwl +++ b/workflows/sc-atac-coverage.cwl @@ -116,6 +116,16 @@ inputs: "sd:layout": advanced: true + export_html_report: + type: boolean? + default: true + label: "Show HTML report" + doc: | + Export tehcnical report in HTML format. + Default: true + "sd:layout": + advanced: true + threads: type: - "null" @@ -192,6 +202,18 @@ outputs: name: "ATAC fragments coverage" height: 120 + sc_report_html_file: + type: File? + outputSource: sc_atac_coverage/sc_report_html_file + label: "Analysis log" + doc: | + Tehcnical report. + HTML format. 
+ "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + experiment_info: type: File label: "IGV tracks order" @@ -236,6 +258,7 @@ steps: default: 32 vector_memory_limit: default: 128 + export_html_report: export_html_report threads: source: threads valueFrom: $(parseInt(self)) @@ -243,6 +266,7 @@ steps: - peaks_bigbed_file - cut_sites_bigwig_file - fragments_bigwig_file + - sc_report_html_file - stdout_log - stderr_log diff --git a/workflows/sc-atac-dbinding.cwl b/workflows/sc-atac-dbinding.cwl index 2ad3d97f..b2944f65 100644 --- a/workflows/sc-atac-dbinding.cwl +++ b/workflows/sc-atac-dbinding.cwl @@ -251,6 +251,16 @@ inputs: "sd:layout": advanced: true + export_html_report: + type: boolean? + default: true + label: "Show HTML report" + doc: | + Export tehcnical report in HTML format. + Default: true + "sd:layout": + advanced: true + threads: type: - "null" @@ -614,6 +624,18 @@ outputs: doc: | Compressed folder with all PDF plots. + sc_report_html_file: + type: File? + outputSource: sc_atac_dbinding/sc_report_html_file + label: "Analysis log" + doc: | + Tehcnical report. + HTML format. + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + sc_atac_dbinding_stdout_log: type: File outputSource: sc_atac_dbinding/stdout_log @@ -668,6 +690,7 @@ steps: default: 32 vector_memory_limit: default: 128 + export_html_report: export_html_report threads: source: threads valueFrom: $(parseInt(self)) @@ -696,6 +719,7 @@ steps: - umap_rd_atacumap_plot_pdf - umap_rd_wnnumap_plot_pdf - dbnd_vlcn_plot_pdf + - sc_report_html_file - stdout_log - stderr_log diff --git a/workflows/sc-atac-filter.cwl b/workflows/sc-atac-filter.cwl index ebc43d6f..2271e660 100644 --- a/workflows/sc-atac-filter.cwl +++ b/workflows/sc-atac-filter.cwl @@ -281,6 +281,16 @@ inputs: "sd:layout": advanced: true + export_html_report: + type: boolean? + default: true + label: "Show HTML report" + doc: | + Export tehcnical report in HTML format. 
+ Default: true + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -746,6 +756,18 @@ outputs: doc: | Compressed folder with all PDF plots. + sc_report_html_file: + type: File? + outputSource: sc_atac_filter/sc_report_html_file + label: "Analysis log" + doc: | + Tehcnical report. + HTML format. + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + sc_atac_filter_stdout_log: type: File outputSource: sc_atac_filter/stdout_log @@ -821,6 +843,7 @@ steps: default: 32 vector_memory_limit: default: 128 + export_html_report: export_html_report threads: source: threads valueFrom: $(parseInt(self)) @@ -858,6 +881,7 @@ steps: - ucsc_cb_html_file - seurat_data_rds - datasets_metadata + - sc_report_html_file - stdout_log - stderr_log diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index fb880881..8a83e276 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -182,6 +182,16 @@ inputs: "sd:layout": advanced: true + export_html_report: + type: boolean? + default: true + label: "Show HTML report" + doc: | + Export tehcnical report in HTML format. + Default: true + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -462,6 +472,18 @@ outputs: doc: | Compressed folder with all PDF plots. + sc_report_html_file: + type: File? + outputSource: sc_atac_reduce/sc_report_html_file + label: "Analysis log" + doc: | + Tehcnical report. + HTML format. 
+ "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + sc_atac_reduce_stdout_log: type: File outputSource: sc_atac_reduce/stdout_log @@ -516,6 +538,7 @@ steps: default: 32 vector_memory_limit: default: 128 + export_html_report: export_html_report threads: source: threads valueFrom: $(parseInt(self)) @@ -557,6 +580,7 @@ steps: - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds + - sc_report_html_file - stdout_log - stderr_log diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 72729360..39dd76e4 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -188,6 +188,16 @@ inputs: "sd:layout": advanced: true + export_html_report: + type: boolean? + default: true + label: "Show HTML report" + doc: | + Export tehcnical report in HTML format. + Default: true + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -666,6 +676,18 @@ outputs: doc: | Compressed folder with all PDF plots. + sc_report_html_file: + type: File? + outputSource: ctype_assign/sc_report_html_file + label: "Analysis log" + doc: | + Tehcnical report. + HTML format. + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + ctype_assign_stdout_log: type: File outputSource: ctype_assign/stdout_log @@ -752,6 +774,7 @@ steps: default: 32 vector_memory_limit: default: 128 + export_html_report: export_html_report threads: source: threads valueFrom: $(parseInt(self)) @@ -790,6 +813,7 @@ steps: - seurat_data_rds - seurat_data_scope - seurat_rna_data_cloupe + - sc_report_html_file - stdout_log - stderr_log diff --git a/workflows/sc-multiome-filter.cwl b/workflows/sc-multiome-filter.cwl index 381ffd49..860a3ddd 100644 --- a/workflows/sc-multiome-filter.cwl +++ b/workflows/sc-multiome-filter.cwl @@ -436,6 +436,16 @@ inputs: "sd:layout": advanced: true + export_html_report: + type: boolean? + default: true + label: "Show HTML report" + doc: | + Export tehcnical report in HTML format. 
+ Default: true + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -1256,6 +1266,18 @@ outputs: doc: | Compressed folder with all PDF plots. + sc_report_html_file: + type: File? + outputSource: sc_multiome_filter/sc_report_html_file + label: "Analysis log" + doc: | + Tehcnical report. + HTML format. + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + sc_multiome_filter_stdout_log: type: File outputSource: sc_multiome_filter/stdout_log @@ -1368,6 +1390,7 @@ steps: default: 32 vector_memory_limit: default: 128 + export_html_report: export_html_report threads: source: threads valueFrom: $(parseInt(self)) @@ -1432,6 +1455,7 @@ steps: - seurat_data_rds - seurat_rna_data_cloupe - datasets_metadata + - sc_report_html_file - stdout_log - stderr_log diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index e1f74240..844f0f14 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -117,6 +117,16 @@ inputs: "sd:layout": advanced: true + export_html_report: + type: boolean? + default: true + label: "Show HTML report" + doc: | + Export tehcnical report in HTML format. + Default: true + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -578,6 +588,18 @@ outputs: doc: | Compressed folder with all PDF plots. + sc_report_html_file: + type: File? + outputSource: sc_rna_cluster/sc_report_html_file + label: "Analysis log" + doc: | + Tehcnical report. + HTML format. 
+ "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + sc_rna_cluster_stdout_log: type: File outputSource: sc_rna_cluster/stdout_log @@ -633,6 +655,7 @@ steps: default: 32 vector_memory_limit: default: 128 + export_html_report: export_html_report threads: source: threads valueFrom: $(parseInt(self)) @@ -666,6 +689,7 @@ steps: - seurat_data_rds - seurat_data_cloupe - seurat_data_scope + - sc_report_html_file - stdout_log - stderr_log diff --git a/workflows/sc-rna-da-cells.cwl b/workflows/sc-rna-da-cells.cwl index 06d8682b..5c667aa6 100644 --- a/workflows/sc-rna-da-cells.cwl +++ b/workflows/sc-rna-da-cells.cwl @@ -116,6 +116,16 @@ inputs: "sd:layout": advanced: true + export_html_report: + type: boolean? + default: true + label: "Show HTML report" + doc: | + Export tehcnical report in HTML format. + Default: true + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -348,6 +358,18 @@ outputs: doc: | Compressed folder with all PDF plots. + sc_report_html_file: + type: File? + outputSource: da_cells/sc_report_html_file + label: "Analysis log" + doc: | + Tehcnical report. + HTML format. + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + da_cells_stdout_log: type: File outputSource: da_cells/stdout_log @@ -392,6 +414,7 @@ steps: default: 32 vector_memory_limit: default: 128 + export_html_report: export_html_report threads: source: threads valueFrom: $(parseInt(self)) @@ -420,6 +443,7 @@ steps: - ucsc_cb_html_file - seurat_data_rds - seurat_data_cloupe + - sc_report_html_file - stdout_log - stderr_log diff --git a/workflows/sc-rna-de-pseudobulk.cwl b/workflows/sc-rna-de-pseudobulk.cwl index 678b1ea1..4cac69f9 100644 --- a/workflows/sc-rna-de-pseudobulk.cwl +++ b/workflows/sc-rna-de-pseudobulk.cwl @@ -257,6 +257,16 @@ inputs: RNA-Seq Datasets" and can be utilized in the current or future steps of analysis. + export_html_report: + type: boolean? 
+ default: true + label: "Show HTML report" + doc: | + Export tehcnical report in HTML format. + Default: true + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -553,6 +563,18 @@ outputs: doc: | Compressed folder with all PDF plots. + sc_report_html_file: + type: File? + outputSource: de_pseudobulk/sc_report_html_file + label: "Analysis log" + doc: | + Tehcnical report. + HTML format. + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + de_pseudobulk_stdout_log: type: File outputSource: de_pseudobulk/stdout_log @@ -648,6 +670,7 @@ steps: default: 32 vector_memory_limit: default: 128 + export_html_report: export_html_report threads: source: threads valueFrom: $(parseInt(self)) @@ -679,6 +702,7 @@ steps: - bulk_read_counts_gct - bulk_phenotypes_cls - cell_read_counts_gct + - sc_report_html_file - stdout_log - stderr_log diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index 4778af6d..2d9f356d 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -248,6 +248,16 @@ inputs: "sd:layout": advanced: true + export_html_report: + type: boolean? + default: true + label: "Show HTML report" + doc: | + Export tehcnical report in HTML format. + Default: true + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -739,6 +749,18 @@ outputs: doc: | Compressed folder with all PDF plots. + sc_report_html_file: + type: File? + outputSource: sc_rna_filter/sc_report_html_file + label: "Analysis log" + doc: | + Tehcnical report. + HTML format. 
+ "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + sc_rna_filter_stdout_log: type: File outputSource: sc_rna_filter/stdout_log @@ -803,6 +825,7 @@ steps: default: 32 vector_memory_limit: default: 128 + export_html_report: export_html_report threads: source: threads valueFrom: $(parseInt(self)) @@ -843,6 +866,7 @@ steps: - seurat_data_rds - seurat_data_cloupe - datasets_metadata + - sc_report_html_file - stdout_log - stderr_log diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 654db337..543d1968 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -269,6 +269,16 @@ inputs: "sd:layout": advanced: true + export_html_report: + type: boolean? + default: true + label: "Show HTML report" + doc: | + Export tehcnical report in HTML format. + Default: true + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -557,6 +567,18 @@ outputs: doc: | Compressed folder with all PDF plots. + sc_report_html_file: + type: File? + outputSource: sc_rna_reduce/sc_report_html_file + label: "Analysis log" + doc: | + Tehcnical report. + HTML format. + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + sc_rna_reduce_stdout_log: type: File outputSource: sc_rna_reduce/stdout_log @@ -638,6 +660,7 @@ steps: default: 32 vector_memory_limit: default: 128 + export_html_report: export_html_report threads: source: threads valueFrom: $(parseInt(self)) @@ -680,6 +703,7 @@ steps: - ucsc_cb_html_file - seurat_data_rds - seurat_data_cloupe + - sc_report_html_file - stdout_log - stderr_log diff --git a/workflows/sc-rna-trajectory.cwl b/workflows/sc-rna-trajectory.cwl index a3915566..6af422a4 100644 --- a/workflows/sc-rna-trajectory.cwl +++ b/workflows/sc-rna-trajectory.cwl @@ -131,6 +131,16 @@ inputs: "sd:layout": advanced: true + export_html_report: + type: boolean? + default: true + label: "Show HTML report" + doc: | + Export tehcnical report in HTML format. 
+ Default: true + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -470,6 +480,18 @@ outputs: doc: | Compressed folder with all PDF plots. + sc_report_html_file: + type: File? + outputSource: rna_trajectory/sc_report_html_file + label: "Analysis log" + doc: | + Tehcnical report. + HTML format. + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + rna_trajectory_stdout_log: type: File outputSource: rna_trajectory/stdout_log @@ -517,6 +539,7 @@ steps: default: 32 vector_memory_limit: default: 128 + export_html_report: export_html_report threads: source: threads valueFrom: $(parseInt(self)) @@ -569,6 +592,7 @@ steps: - ucsc_cb_html_file - seurat_data_rds - seurat_data_cloupe + - sc_report_html_file - stdout_log - stderr_log diff --git a/workflows/sc-triangulate.cwl b/workflows/sc-triangulate.cwl index ccee7346..923aac35 100644 --- a/workflows/sc-triangulate.cwl +++ b/workflows/sc-triangulate.cwl @@ -92,6 +92,16 @@ inputs: "sd:layout": advanced: true + export_html_report: + type: boolean? + default: true + label: "Show HTML report" + doc: | + Export tehcnical report in HTML format. + Default: true + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -288,6 +298,18 @@ outputs: doc: | Compressed folder with all PDF plots. + sc_report_html_file: + type: File? + outputSource: triangulate/sc_report_html_file + label: "Analysis log" + doc: | + Tehcnical report. + HTML format. 
+ "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + triangulate_stdout_log: type: File outputSource: triangulate/stdout_log @@ -325,6 +347,7 @@ steps: default: 32 vector_memory_limit: default: 128 + export_html_report: export_html_report threads: source: threads valueFrom: $(parseInt(self)) @@ -351,6 +374,7 @@ steps: - ucsc_cb_html_file - seurat_data_rds - seurat_rna_data_cloupe + - sc_report_html_file - stdout_log - stderr_log diff --git a/workflows/sc-vdj-profile.cwl b/workflows/sc-vdj-profile.cwl index 781f7153..b34ad630 100644 --- a/workflows/sc-vdj-profile.cwl +++ b/workflows/sc-vdj-profile.cwl @@ -158,6 +158,16 @@ inputs: "sd:layout": advanced: true + export_html_report: + type: boolean? + default: true + label: "Show HTML report" + doc: | + Export tehcnical report in HTML format. + Default: true + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -190,7 +200,7 @@ inputs: - "3" - "4" - "5" - - "4" + - "6" default: "4" label: "Cores/CPUs" doc: | @@ -577,6 +587,18 @@ outputs: doc: | Compressed folder with all PDF plots. + sc_report_html_file: + type: File? + outputSource: vdj_profile/sc_report_html_file + label: "Analysis log" + doc: | + Tehcnical report. + HTML format. + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + vdj_profile_stdout_log: type: File outputSource: vdj_profile/stdout_log @@ -621,6 +643,7 @@ steps: default: 32 vector_memory_limit: default: 128 + export_html_report: export_html_report threads: source: threads valueFrom: $(parseInt(self)) @@ -654,6 +677,7 @@ steps: - seurat_data_rds - seurat_data_cloupe - seurat_data_scope + - sc_report_html_file - stdout_log - stderr_log diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index ae3353aa..f1e81c81 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -166,6 +166,16 @@ inputs: "sd:layout": advanced: true + export_html_report: + type: boolean? 
+ default: true + label: "Show HTML report" + doc: | + Export tehcnical report in HTML format. + Default: true + "sd:layout": + advanced: true + color_theme: type: - "null" @@ -707,6 +717,18 @@ outputs: doc: | Compressed folder with all PDF plots. + sc_report_html_file: + type: File? + outputSource: sc_wnn_cluster/sc_report_html_file + label: "Analysis log" + doc: | + Tehcnical report. + HTML format. + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + sc_wnn_cluster_stdout_log: type: File outputSource: sc_wnn_cluster/stdout_log @@ -770,6 +792,7 @@ steps: default: 32 vector_memory_limit: default: 128 + export_html_report: export_html_report threads: source: threads valueFrom: $(parseInt(self)) @@ -808,6 +831,7 @@ steps: - seurat_data_rds - seurat_rna_data_cloupe - seurat_data_scope + - sc_report_html_file - stdout_log - stderr_log From e02ae0e31e36ed56e2d475a0d77bdead58ed8b38 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 21 Aug 2024 17:35:50 -0400 Subject: [PATCH 152/162] Make all sc cluster workflows to support resolution ranges --- workflows/sc-atac-cluster.cwl | 34 ++++++++++++++++++++++++++++++---- workflows/sc-rna-cluster.cwl | 33 +++++++++++++++++++++++++++++---- workflows/sc-wnn-cluster.cwl | 33 +++++++++++++++++++++++++++++---- 3 files changed, 88 insertions(+), 12 deletions(-) diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index f83e1776..9240a4f1 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -19,6 +19,26 @@ requirements: let splitted_line = line?line.split(/[\s,]+/).map(parseFloat):null; return (splitted_line && !!splitted_line.length)?splitted_line:null; }; + - var parse_range = function(line) { + if (line.includes("-")) { + const parts = line.split("-"); + const start = parseFloat(parts[0].trim()); + let end, step; + if (parts[1].includes(":")) { + [end, step] = parts[1].split(":").map(Number); if (!(step > 0)) { step = 0.1; } /* a missing, zero, negative or NaN step would make the loop below run forever, so use the documented default */ + } else { + end = parseFloat(parts[1].trim()); + step = 0.1; +
} + const result = []; + for (let i = start; i <= end; i = parseFloat((i + step).toFixed(10))) { + result.push(parseFloat(i.toFixed(10))); + } + return result; + } else { + return [parseFloat(line)]; + } + }; "sd:upstream": @@ -85,15 +105,19 @@ inputs: Default: 40 resolution: - type: float? - default: 0.3 + type: string? + default: "0.3" label: "Clustering resolution" doc: | Resolution to define the "granularity" of the clustered data. Larger values lead to a bigger number of clusters. Optimal resolution often increases - with the number of cells. + with the number of cells. To run the + analysis with multiple resolutions, + provide a range in a form of + start-end:step. Step parameter is + optional and equal to 0.1 by default. Default: 0.3 identify_diff_peaks: @@ -455,7 +479,9 @@ steps: default: euclidean cluster_algorithm: default: "slm" - resolution: resolution + resolution: + source: resolution + valueFrom: $(parse_range(self)) atac_fragments_file: atac_fragments_file genes_of_interest: source: genes_of_interest diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index 844f0f14..d248d789 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -19,6 +19,26 @@ requirements: let splitted_line = line?line.split(/[\s,]+/).map(parseFloat):null; return (splitted_line && !!splitted_line.length)?splitted_line:null; }; + - var parse_range = function(line) { + if (line.includes("-")) { + const parts = line.split("-"); + const start = parseFloat(parts[0].trim()); + let end, step; + if (parts[1].includes(":")) { + [end, step] = parts[1].split(":").map(Number); if (!(step > 0)) { step = 0.1; } /* a missing, zero, negative or NaN step would make the loop below run forever, so use the documented default */ + } else { + end = parseFloat(parts[1].trim()); + step = 0.1; + } + const result = []; + for (let i = start; i <= end; i = parseFloat((i + step).toFixed(10))) { + result.push(parseFloat(i.toFixed(10))); + } + return result; + } else { + return [parseFloat(line)]; + } + }; "sd:upstream": @@ -65,8 +85,8 @@ inputs: Default: 40 resolution: - type: float? - default: 0.3 + type: string? + default: "0.3" label: "Clustering resolution" doc: | The resolution defines the “granularity” @@ -75,7 +95,10 @@ inputs: resolution often increases with the number of cells. For a dataset of 3,000 cells, a value within the 0.3-1.2 range usually - returns good results. + returns good results. To run the analysis + with multiple resolutions, provide a range + in a form of start-end:step. Step parameter + is optional and equal to 0.1 by default. Default: 0.3 identify_diff_genes: @@ -628,7 +651,9 @@ steps: default: euclidean cluster_algorithm: default: "louvain" - resolution: resolution + resolution: + source: resolution + valueFrom: $(parse_range(self)) genes_of_interest: source: genes_of_interest valueFrom: $(split_features(self)) diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index f1e81c81..a0ecfa49 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -19,6 +19,26 @@ requirements: let splitted_line = line?line.split(/[\s,]+/).map(parseFloat):null; return (splitted_line && !!splitted_line.length)?splitted_line:null; }; + - var parse_range = function(line) { + if (line.includes("-")) { + const parts = line.split("-"); + const start = parseFloat(parts[0].trim()); + let end, step; + if (parts[1].includes(":")) { + [end, step] = parts[1].split(":").map(Number); if (!(step > 0)) { step = 0.1; } /* a missing, zero, negative or NaN step would make the loop below run forever, so use the documented default */ + } else { + end = parseFloat(parts[1].trim()); + step = 0.1; +
- default: 0.3 + type: string? + default: "0.3" label: "Clustering resolution" doc: | The resolution defines the “granularity” @@ -75,7 +95,10 @@ inputs: resolution often increases with the number of cells. For a dataset of 3,000 cells, a value within the 0.3-1.2 range usually - returns good results. + returns good results. To run the analysis + with multiple resolutions, provide a range + in a form of start-end:step. Step parameter + is optional and equal to 0.1 by default. Default: 0.3 identify_diff_genes: @@ -628,7 +651,9 @@ steps: default: euclidean cluster_algorithm: default: "louvain" - resolution: resolution + resolution: + source: resolution + valueFrom: $(parse_range(self)) genes_of_interest: source: genes_of_interest valueFrom: $(split_features(self)) diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index f1e81c81..a0ecfa49 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -19,6 +19,26 @@ requirements: let splitted_line = line?line.split(/[\s,]+/).map(parseFloat):null; return (splitted_line && !!splitted_line.length)?splitted_line:null; }; + - var parse_range = function(line) { + if (line.includes("-")) { + const parts = line.split("-"); + const start = parseFloat(parts[0].trim()); + let end, step; + if (parts[1].includes(":")) { + [end, step] = parts[1].split(":").map(Number); + } else { + end = parseFloat(parts[1].trim()); + step = 0.1; + } + const result = []; + for (let i = start; i <= end; i = parseFloat((i + step).toFixed(10))) { + result.push(parseFloat(i.toFixed(10))); + } + return result; + } else { + return [parseFloat(line)]; + } + }; "sd:upstream": @@ -96,15 +116,18 @@ inputs: Default: 40 resolution: - type: float? - default: 0.3 + type: string? + default: "0.3" label: "Clustering resolution" doc: | The resolution defines the “granularity” of the clustered data. Larger resolution values lead to more clusters. The optimal resolution often increases with the number - of cells. + of cells. 
To run the analysis with multiple + resolutions, provide a range in a form of + start-end:step. Step parameter is optional + and equal to 0.1 by default. Default: 0.3 identify_diff_genes: @@ -757,7 +780,9 @@ steps: atac_dimensions: atac_dimensions cluster_algorithm: default: "slm" - resolution: resolution + resolution: + source: resolution + valueFrom: $(parse_range(self)) atac_fragments_file: atac_fragments_file genes_of_interest: source: genes_of_interest From 73dff6ebadb3327cce3b756ca4bfd09254d0d939 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 3 Sep 2024 10:54:47 -0400 Subject: [PATCH 153/162] Update sc tools dockerfile to v0.0.41 --- tools/cellbrowser-build-altanalyze.cwl | 2 +- tools/cellbrowser-build-cellranger-arc.cwl | 2 +- tools/cellbrowser-build-cellranger-atac.cwl | 2 +- tools/cellbrowser-build-cellranger.cwl | 2 +- tools/sc-atac-cluster.cwl | 2 +- tools/sc-atac-coverage.cwl | 2 +- tools/sc-atac-dbinding.cwl | 2 +- tools/sc-atac-filter.cwl | 2 +- tools/sc-atac-reduce.cwl | 2 +- tools/sc-ctype-assign.cwl | 2 +- tools/sc-multiome-filter.cwl | 2 +- tools/sc-rna-cluster.cwl | 2 +- tools/sc-rna-da-cells.cwl | 2 +- tools/sc-rna-de-pseudobulk.cwl | 2 +- tools/sc-rna-filter.cwl | 2 +- tools/sc-rna-reduce.cwl | 2 +- tools/sc-rna-trajectory.cwl | 2 +- tools/sc-triangulate.cwl | 2 +- tools/sc-vdj-profile.cwl | 2 +- tools/sc-wnn-cluster.cwl | 2 +- 20 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tools/cellbrowser-build-altanalyze.cwl b/tools/cellbrowser-build-altanalyze.cwl index f92d46c6..41da031a 100644 --- a/tools/cellbrowser-build-altanalyze.cwl +++ b/tools/cellbrowser-build-altanalyze.cwl @@ -4,7 +4,7 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 requirements: diff --git a/tools/cellbrowser-build-cellranger-arc.cwl b/tools/cellbrowser-build-cellranger-arc.cwl index e0f7f60f..84d18bbd 100644 --- 
a/tools/cellbrowser-build-cellranger-arc.cwl +++ b/tools/cellbrowser-build-cellranger-arc.cwl @@ -4,7 +4,7 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 requirements: diff --git a/tools/cellbrowser-build-cellranger-atac.cwl b/tools/cellbrowser-build-cellranger-atac.cwl index 192e4b36..b6d870ee 100644 --- a/tools/cellbrowser-build-cellranger-atac.cwl +++ b/tools/cellbrowser-build-cellranger-atac.cwl @@ -4,7 +4,7 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 requirements: diff --git a/tools/cellbrowser-build-cellranger.cwl b/tools/cellbrowser-build-cellranger.cwl index 0abc4e93..4a08548b 100644 --- a/tools/cellbrowser-build-cellranger.cwl +++ b/tools/cellbrowser-build-cellranger.cwl @@ -4,7 +4,7 @@ class: CommandLineTool hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 requirements: diff --git a/tools/sc-atac-cluster.cwl b/tools/sc-atac-cluster.cwl index 3321195a..d3abc59c 100644 --- a/tools/sc-atac-cluster.cwl +++ b/tools/sc-atac-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 inputs: diff --git a/tools/sc-atac-coverage.cwl b/tools/sc-atac-coverage.cwl index 1fb26969..fe3053f8 100644 --- a/tools/sc-atac-coverage.cwl +++ b/tools/sc-atac-coverage.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 inputs: diff --git a/tools/sc-atac-dbinding.cwl b/tools/sc-atac-dbinding.cwl index 41c03c23..e1818e17 100644 --- a/tools/sc-atac-dbinding.cwl +++ b/tools/sc-atac-dbinding.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 
+ dockerPull: biowardrobe2/sc-tools:v0.0.41 inputs: diff --git a/tools/sc-atac-filter.cwl b/tools/sc-atac-filter.cwl index 9144ffc8..02a75683 100644 --- a/tools/sc-atac-filter.cwl +++ b/tools/sc-atac-filter.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 inputs: diff --git a/tools/sc-atac-reduce.cwl b/tools/sc-atac-reduce.cwl index 7782ac31..ed1cc6a2 100644 --- a/tools/sc-atac-reduce.cwl +++ b/tools/sc-atac-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 inputs: diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index 13354272..6c58b24f 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 inputs: diff --git a/tools/sc-multiome-filter.cwl b/tools/sc-multiome-filter.cwl index 4ddfedd8..3f0501b8 100644 --- a/tools/sc-multiome-filter.cwl +++ b/tools/sc-multiome-filter.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 inputs: diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index 3de79a12..1dee0a24 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 inputs: diff --git a/tools/sc-rna-da-cells.cwl b/tools/sc-rna-da-cells.cwl index fc2b5c0d..acf04418 100644 --- a/tools/sc-rna-da-cells.cwl +++ b/tools/sc-rna-da-cells.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 
inputs: diff --git a/tools/sc-rna-de-pseudobulk.cwl b/tools/sc-rna-de-pseudobulk.cwl index 580b3d6a..acc08299 100644 --- a/tools/sc-rna-de-pseudobulk.cwl +++ b/tools/sc-rna-de-pseudobulk.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 inputs: diff --git a/tools/sc-rna-filter.cwl b/tools/sc-rna-filter.cwl index 3c083764..a8c63d08 100644 --- a/tools/sc-rna-filter.cwl +++ b/tools/sc-rna-filter.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 inputs: diff --git a/tools/sc-rna-reduce.cwl b/tools/sc-rna-reduce.cwl index 142461ef..53a81307 100644 --- a/tools/sc-rna-reduce.cwl +++ b/tools/sc-rna-reduce.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 inputs: diff --git a/tools/sc-rna-trajectory.cwl b/tools/sc-rna-trajectory.cwl index dd9b6477..0a2ac121 100644 --- a/tools/sc-rna-trajectory.cwl +++ b/tools/sc-rna-trajectory.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 inputs: diff --git a/tools/sc-triangulate.cwl b/tools/sc-triangulate.cwl index fea8b1ac..6c18cf28 100644 --- a/tools/sc-triangulate.cwl +++ b/tools/sc-triangulate.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 inputs: diff --git a/tools/sc-vdj-profile.cwl b/tools/sc-vdj-profile.cwl index 1308a99b..29b80717 100644 --- a/tools/sc-vdj-profile.cwl +++ b/tools/sc-vdj-profile.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 inputs: diff --git a/tools/sc-wnn-cluster.cwl 
b/tools/sc-wnn-cluster.cwl index d363ed40..1f3c25c5 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -11,7 +11,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/sc-tools:v0.0.40 + dockerPull: biowardrobe2/sc-tools:v0.0.41 inputs: From 7e771fe9cb20164ff4fae29c63e65f014e883c9f Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 9 Sep 2024 11:52:04 -0400 Subject: [PATCH 154/162] Add sc-rna-azimuth pipeline --- tools/sc-rna-azimuth.cwl | 966 +++++++++++++++++++++++++++++ workflows/sc-atac-cluster.cwl | 1 + workflows/sc-atac-coverage.cwl | 1 + workflows/sc-atac-dbinding.cwl | 1 + workflows/sc-atac-reduce.cwl | 1 + workflows/sc-ctype-assign.cwl | 1 + workflows/sc-rna-azimuth.cwl | 859 +++++++++++++++++++++++++ workflows/sc-rna-cluster.cwl | 1 + workflows/sc-rna-da-cells.cwl | 2 +- workflows/sc-rna-de-pseudobulk.cwl | 1 + workflows/sc-rna-reduce.cwl | 1 + workflows/sc-rna-trajectory.cwl | 1 + workflows/sc-triangulate.cwl | 1 + workflows/sc-vdj-profile.cwl | 1 + workflows/sc-wnn-cluster.cwl | 1 + 15 files changed, 1838 insertions(+), 1 deletion(-) create mode 100644 tools/sc-rna-azimuth.cwl create mode 100644 workflows/sc-rna-azimuth.cwl diff --git a/tools/sc-rna-azimuth.cwl b/tools/sc-rna-azimuth.cwl new file mode 100644 index 00000000..280b6ea8 --- /dev/null +++ b/tools/sc-rna-azimuth.cwl @@ -0,0 +1,966 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: EnvVarRequirement + envDef: + R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-tools:v0.0.41 + + +inputs: + + query_data_rds: + type: File + inputBinding: + prefix: "--query" + doc: | + Path to the RDS file to load the query Seurat + object from. This file should include genes + expression information stored in the RNA assay + and, optionally, chromatin accessibility + information stored in the ATAC assay. 
The latter + is used only for plots. + + reference_data_rds: + type: File + inputBinding: + prefix: "--reference" + doc: | + Path to the RDS file to load the reference Seurat + object from. This file can be downloaded as + ref.Rds from the + https://azimuth.hubmapconsortium.org/references/ + + reference_data_index: + type: File + inputBinding: + prefix: "--annoyidx" + doc: | + Path to the annoy index file generated for the + provided reference Seurat object. This file can + be downloaded as idx.annoy from the + https://azimuth.hubmapconsortium.org/references/ + + reference_source_column: + type: string + inputBinding: + prefix: "--source" + doc: | + Column from the metadata of the reference Seurat + object to select the reference annotations. + + identify_diff_genes: + type: boolean? + inputBinding: + prefix: "--diffgenes" + doc: | + Identify differentially expressed genes (putative + gene markers) for the predicted cell types. + Default: false + + identify_diff_peaks: + type: boolean? + inputBinding: + prefix: "--diffpeaks" + doc: | + Identify differentially accessible peaks for the + predicted cell types. Ignored if the query Seurat + object doesn't include chromatin accessibility + information stored in the ATAC assay. + Default: false + + rna_minimum_logfc: + type: float? + inputBinding: + prefix: "--rnalogfc" + doc: | + For putative gene markers identification include only + those genes that on average have a log fold change + difference in the expression between every tested + pair of the predicted cell types not lower than this + value. Ignored if '--diffgenes' is not set. + Default: 0.25 + + rna_minimum_pct: + type: float? + inputBinding: + prefix: "--rnaminpct" + doc: | + For putative gene markers identification include only + those genes that are detected in not lower than this + fraction of cells in either of the two tested predicted + cell types. Ignored if '--diffgenes' is not set. + Default: 0.1 + + only_positive_diff_genes: + type: boolean?
+ inputBinding: + prefix: "--rnaonlypos" + doc: | + For putative gene markers identification return only + upregulated markers. Ignored if '--diffgenes' is not + set. Default: false + + rna_test_to_use: + type: + - "null" + - type: enum + symbols: + - "wilcox" + - "bimod" + - "roc" + - "t" + - "negbinom" + - "poisson" + - "LR" + - "MAST" + - "DESeq2" + inputBinding: + prefix: "--rnatestuse" + doc: | + Statistical test to use for putative gene markers + identification. Ignored if '--diffgenes' is not set. + Default: wilcox + + atac_minimum_logfc: + type: float? + inputBinding: + prefix: "--ataclogfc" + doc: | + For differentially accessible peaks identification include + only those peaks that on average have a log fold change + difference in the chromatin accessibility between every + tested pair of the predicted cell types not lower than this + value. Ignored if '--diffpeaks' is not set or if the query + Seurat object doesn't include ATAC assay. + Default: 0.25 + + atac_minimum_pct: + type: float? + inputBinding: + prefix: "--atacminpct" + doc: | + For differentially accessible peaks identification include + only those peaks that are detected in not lower than this + fraction of cells in either of the two tested predicted + cell types. Ignored if '--diffpeaks' is not set or if the + query Seurat object doesn't include ATAC assay. + Default: 0.05 + + atac_test_to_use: + type: + - "null" + - type: enum + symbols: + - "wilcox" + - "bimod" + - "roc" + - "t" + - "negbinom" + - "poisson" + - "LR" + - "MAST" + - "DESeq2" + inputBinding: + prefix: "--atactestuse" + doc: | + Statistical test to use for differentially accessible peaks + identification. Ignored if '--diffpeaks' is not set or if + the query Seurat object doesn't include ATAC assay. + Default: LR + + atac_fragments_file: + type: File? + secondaryFiles: + - .tbi + inputBinding: + prefix: "--fragments" + doc: | + Count and barcode information for every ATAC fragment + used in the query Seurat object. 
File should be saved + in TSV format with a tbi-index file. Ignored if the + query Seurat object doesn't include ATAC assay. + + genes_of_interest: + type: + - "null" + - string + - string[] + inputBinding: + prefix: "--genes" + doc: | + Genes of interest to build gene expression and Tn5 + insertion frequency plots for the nearest peaks. To + build Tn5 insertion frequency plots for the nearest + peaks, the query Seurat object should include ATAC assay + and the --fragments file should be provided. + Default: None + + cvrg_upstream_bp: + type: int? + inputBinding: + prefix: "--upstream" + doc: | + Number of bases to extend the genome coverage region for + a specific gene upstream. Ignored if --genes or --fragments + parameters are not provided or when the query Seurat object + doesn't include ATAC assay. + Default: 2500 + + cvrg_downstream_bp: + type: int? + inputBinding: + prefix: "--downstream" + doc: | + Number of bases to extend the genome coverage region for + a specific gene downstream. Ignored if --genes or --fragments + parameters are not provided or when the query Seurat object + doesn't include ATAC assay. + Default: 2500 + + export_pdf_plots: + type: boolean? + inputBinding: + prefix: "--pdf" + doc: | + Export plots in PDF. + Default: false + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + inputBinding: + prefix: "--theme" + doc: | + Color theme for all generated plots. + Default: classic + + verbose: + type: boolean? + inputBinding: + prefix: "--verbose" + doc: | + Print debug information. + Default: false + + export_h5seurat_data: + type: boolean? + inputBinding: + prefix: "--h5seurat" + doc: | + Save Seurat data to h5seurat file. + Default: false + + export_h5ad_data: + type: boolean? + inputBinding: + prefix: "--h5ad" + doc: | + Save raw counts from the RNA and ATAC (if present) + assays to h5ad file(s). 
+ Default: false + + export_loupe_data: + type: boolean? + inputBinding: + prefix: "--loupe" + doc: | + Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. + Default: false + + export_scope_data: + type: boolean? + inputBinding: + prefix: "--scope" + doc: | + Save Seurat data to SCope compatible loom file. Only not + normalized raw counts from the RNA assay will be saved. + Default: false + + export_ucsc_cb: + type: boolean? + inputBinding: + prefix: "--cbbuild" + doc: | + Export results to UCSC Cell Browser. + Default: false + + export_html_report: + type: boolean? + default: false + doc: | + Export technical report. HTML format. + Note, stdout will be less informative. + Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix. + Default: ./sc + + parallel_memory_limit: + type: int? + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between + the workers when using multiple --cpus. + Default: 32 + + vector_memory_limit: + type: int? + default: 128 + doc: | + Maximum vector memory in GB allowed to be used by R. + Default: 128 + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. + Default: 1 + + seed: + type: int? + inputBinding: + prefix: "--seed" + doc: | + Seed number for random values. + Default: 42 + + +outputs: + + cell_cnts_gr_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_cell_cnts_gr_ctyp.png" + doc: | + Number of cells per cell type. + All cells. + PNG format. + + umap_qc_mtrcs_plot_png: + type: File? + outputBinding: + glob: "*_umap_qc_mtrcs.png" + doc: | + UMAP, QC metrics. + All cells. + PNG format. + + gene_umi_spl_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_gene_umi_spl_ctyp.png" + doc: | + Genes vs RNA reads per cell. + Split by cell type; all cells. + PNG format.
+ + umi_mito_spl_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_umi_mito_spl_ctyp.png" + doc: | + RNA reads vs mitochondrial % per cell. + Split by cell type; all cells. + PNG format. + + rnadbl_gr_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_rnadbl_gr_ctyp.png" + doc: | + Percentage of RNA doublets per cell type. + All cells. + PNG format. + + tss_frgm_spl_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_tss_frgm_spl_ctyp.png" + doc: | + TSS enrichment score vs ATAC + fragments in peaks per cell. + Split by cell type; all cells. + PNG format. + + atacdbl_gr_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_atacdbl_gr_ctyp.png" + doc: | + Percentage of ATAC doublets per cell type. + All cells. + PNG format. + + rna_atac_cnts_spl_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_rna_atac_cnts_spl_ctyp.png" + doc: | + RNA reads vs ATAC fragments in peaks per cell. + Split by cell type; all cells. + PNG format. + + vrlpdbl_gr_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_vrlpdbl_gr_ctyp.png" + doc: | + Percentage of RNA and ATAC doublets + per cell type. + All cells. + PNG format. + + qc_mtrcs_dnst_gr_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_qc_mtrcs_dnst_gr_ctyp.png" + doc: | + Distribution of QC metrics per cell + colored by cell type. + All cells. + PNG format. + + umap_gr_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_ctyp.png" + doc: | + UMAP colored by cell type. + All cells. + PNG format. + + umap_gr_ctyp_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_ctyp_spl_idnt.png" + doc: | + UMAP colored by cell type. + Split by dataset; downsampled to the + smallest dataset. + PNG format. + + cmp_gr_ctyp_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_cmp_gr_ctyp_spl_idnt.png" + doc: | + Composition plot colored by cell type. + Split by dataset; downsampled to the + smallest dataset. + PNG format. + + cmp_gr_idnt_spl_ctyp_plot_png: + type: File? 
+ outputBinding: + glob: "*_cmp_gr_idnt_spl_ctyp.png" + doc: | + Composition plot colored by dataset. + Split by cell type; downsampled to + the smallest dataset. + PNG format. + + umap_gr_ph_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_ph_spl_idnt.png" + doc: | + UMAP colored by cell cycle phase. + Split by dataset; downsampled to the + smallest dataset. + PNG format. + + cmp_gr_ph_spl_idnt_plot_png: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_idnt.png" + doc: | + Composition plot colored by cell cycle phase. + Split by dataset; downsampled to the smallest + dataset. + PNG format. + + umap_gr_ctyp_spl_ph_png: + type: File? + outputBinding: + glob: "*_umap_gr_ctyp_spl_ph.png" + doc: | + UMAP colored by cell type. + Split by cell cycle phase; downsampled + to the smallest dataset (if multiple + datasets are analyzed jointly). + PNG format. + + cmp_gr_ph_spl_ctyp_png: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_ctyp.png" + doc: | + Composition plot colored by cell cycle phase. + Split by cell type; downsampled to the + smallest dataset (if multiple datasets are + analyzed jointly). + PNG format. + + umap_gr_ctyp_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_ctyp_spl_cnd.png" + doc: | + UMAP colored by cell type. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + + cmp_gr_ctyp_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_cmp_gr_ctyp_spl_cnd.png" + doc: | + Composition plot colored by cell type. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + + cmp_gr_cnd_spl_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_cmp_gr_cnd_spl_ctyp.png" + doc: | + Composition plot colored by grouping condition. + Split by cell type; first downsampled to the + smallest dataset, then downsampled to the + smallest group. + PNG format. 
+ + umap_gr_ph_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_umap_gr_ph_spl_cnd.png" + doc: | + UMAP colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + + cmp_gr_ph_spl_cnd_plot_png: + type: File? + outputBinding: + glob: "*_cmp_gr_ph_spl_cnd.png" + doc: | + Composition plot colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + + xpr_avg_plot_png: + type: File? + outputBinding: + glob: "*_xpr_avg.png" + doc: | + Average gene expression. + PNG format. + + xpr_per_cell_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_[!sgnl_]*.png" + doc: | + UMAP colored by gene expression. + All genes of interest. + PNG format. + + xpr_per_cell_sgnl_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_xpr_per_cell_sgnl_*.png" + doc: | + UMAP colored by gene expression density. + All genes of interest. + PNG format. + + xpr_dnst_plot_png: + type: File? + outputBinding: + glob: "*_xpr_dnst.png" + doc: | + Gene expression density. + PNG format. + + xpr_htmp_plot_png: + type: File? + outputBinding: + glob: "*_xpr_htmp.png" + doc: | + Gene expression heatmap. + Top gene markers. + PNG format. + + cvrg_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_cvrg_*.png" + doc: | + ATAC fragment coverage. + All genes of interest. + PNG format. + + all_plots_pdf: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*.pdf" + doc: | + All generated plots. + PDF format. + + xpr_htmp_tsv: + type: File? + outputBinding: + glob: "*_xpr_htmp.tsv" + doc: | + Gene expression heatmap. + Top gene markers. + TSV format. + + gene_markers_tsv: + type: File? + outputBinding: + glob: "*_gene_markers.tsv" + doc: | + Gene markers. 
+ TSV format. + + peak_markers_tsv: + type: File? + outputBinding: + glob: "*_peak_markers.tsv" + doc: | + Peak markers. + TSV format. + + ucsc_cb_config_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser" + doc: | + UCSC Cell Browser configuration data. + + ucsc_cb_html_data: + type: Directory? + outputBinding: + glob: "*_cellbrowser/html_data" + doc: | + UCSC Cell Browser html data. + + ucsc_cb_html_file: + type: File? + outputBinding: + glob: "*_cellbrowser/html_data/index.html" + doc: | + UCSC Cell Browser html index. + + seurat_data_rds: + type: File + outputBinding: + glob: "*_data.rds" + doc: | + Seurat object. + RDS format. + + seurat_data_h5seurat: + type: File? + outputBinding: + glob: "*_data.h5seurat" + doc: | + Seurat object. + h5Seurat format. + + seurat_rna_data_h5ad: + type: File? + outputBinding: + glob: "*_rna_counts.h5ad" + doc: | + Seurat object. + RNA counts. + H5AD format. + + seurat_atac_data_h5ad: + type: File? + outputBinding: + glob: "*_atac_counts.h5ad" + doc: | + Seurat object. + ATAC counts. + H5AD format. + + seurat_rna_data_cloupe: + type: File? + outputBinding: + glob: "*_rna_counts.cloupe" + doc: | + Seurat object. + RNA counts. + Loupe format + + seurat_data_scope: + type: File? + outputBinding: + glob: "*_data.loom" + doc: | + Seurat object. + SCope compatible. + Loom format. + + sc_report_html_file: + type: File? + outputBinding: + glob: "sc_report.html" + doc: | + Technical report. + HTML format.
+ + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["Rscript"] +arguments: +- valueFrom: $(inputs.export_html_report?["/usr/local/bin/sc_report_wrapper.R", "/usr/local/bin/sc_rna_azimuth.R"]:"/usr/local/bin/sc_rna_azimuth.R") + + +stdout: sc_rna_azimuth_stdout.log +stderr: sc_rna_azimuth_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-Cell RNA-Seq Reference Mapping" +s:name: "Single-Cell RNA-Seq Reference Mapping" +s:alternateName: "Single-Cell RNA-Seq Reference Mapping" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-rna-azimuth.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-Cell RNA-Seq Reference Mapping + + Predicts cell types on the cell level based on + the reference annotation using Azimuth R package. 
+ Reference models can be downloaded from the + https://azimuth.hubmapconsortium.org/ + + +s:about: | + usage: /tmp/sc_tools/sc_rna_azimuth.R [-h] --query QUERY --reference REFERENCE + --annoyidx ANNOYIDX --source SOURCE + [--diffgenes] [--diffpeaks] + [--rnalogfc RNALOGFC] + [--rnaminpct RNAMINPCT] [--rnaonlypos] + [--rnatestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] + [--ataclogfc ATACLOGFC] + [--atacminpct ATACMINPCT] + [--atactestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] + [--fragments FRAGMENTS] + [--genes [GENES [GENES ...]]] + [--upstream UPSTREAM] + [--downstream DOWNSTREAM] [--pdf] + [--verbose] [--h5seurat] [--h5ad] + [--loupe] [--cbbuild] [--scope] + [--tmpdir TMPDIR] [--output OUTPUT] + [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] + [--cpus CPUS] [--memory MEMORY] + [--seed SEED] + + Single-Cell RNA-Seq Reference Mapping + + optional arguments: + -h, --help show this help message and exit + --query QUERY Path to the RDS file to load the query Seurat object + from. This file should include genes expression + information stored in the RNA assay and, optionally, + chromatin accessibility information stored in the ATAC + assay. The later is used only for plots. + --reference REFERENCE + Path to the RDS file to load the reference Seurat + object from. This file can be downloaded as ref.Rds + from the + https://azimuth.hubmapconsortium.org/references/ + --annoyidx ANNOYIDX Path to the annoy index file generated for the + provided reference Seurat object. This file can be + downloaded as idx.annoy from the + https://azimuth.hubmapconsortium.org/references/ + --source SOURCE Column from the metadata of the reference Seurat + object to select the reference annotations. + --diffgenes Identify differentially expressed genes (putative gene + markers) for the predicted cell types. Default: false + --diffpeaks Identify differentially accessible peaks for the + predicted cell types. 
Ignored if the query Seurat + object doesn't include chromatin accessibility + information stored in the ATAC assay. Default: false + --rnalogfc RNALOGFC For putative gene markers identification include only + those genes that on average have a log fold change + difference in the expression between every tested pair + of the predicted cell types not lower than this value. + Ignored if '--diffgenes is not set. Default: 0.25 + --rnaminpct RNAMINPCT + For putative gene markers identification include only + those genes that are detected in not lower than this + fraction of cells in either of the two tested + predicted cell types. Ignored if '--diffgenes' is not + set. Default: 0.1 + --rnaonlypos For putative gene markers identification return only + upregulated markers. Ignored if '--diffgenes' is not + set. Default: false + --rnatestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2} + Statistical test to use for putative gene markers + identification. Ignored if '--diffgenes' is not set. + Default: wilcox + --ataclogfc ATACLOGFC + For differentially accessible peaks identification + include only those peaks that on average have a log + fold change difference in the chromatin accessibility + between every tested pair of the predicted cell types + not lower than this value. Ignored if '--diffpeaks is + not set or if the query Seurat object doesn't include + ATAC assay. Default: 0.25 + --atacminpct ATACMINPCT + For differentially accessible peaks identification + include only those peaks that are detected in not + lower than this fraction of cells in either of the two + tested predicted cell types. Ignored if '--diffpeaks' + is not set or if the query Seurat object doesn't + include ATAC assay. Default: 0.05 + --atactestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2} + Statistical test to use for differentially accessible + peaks identification. Ignored if '--diffpeaks' is not + set or if the query Seurat object doesn't include ATAC + assay. 
Default: LR + --fragments FRAGMENTS + Count and barcode information for every ATAC fragment + used in the query Seurat object. File should be saved + in TSV format with tbi-index file. Ignored if the + query Seurat object doesn't include ATAC assay. + --genes [GENES [GENES ...]] + Genes of interest to build gene expression and Tn5 + insertion frequency plots for the nearest peaks. To + build Tn5 insertion frequency plots for the nearest + peaks the query Seurat object should include ATAC + assay as well as the --fragments file should be + provided. Default: None + --upstream UPSTREAM Number of bases to extend the genome coverage region + for a specific gene upstream. Ignored if --genes or + --fragments parameters are not provided or when the + query Seurat object doesn't include ATAC assay. + Default: 2500 + --downstream DOWNSTREAM + Number of bases to extend the genome coverage region + for a specific gene downstream. Ignored if --genes or + --fragments parameters are not provided or when the + query Seurat object doesn't include ATAC assay. + Default: 2500 + --pdf Export plots in PDF. Default: false + --verbose Print debug information. Default: false + --h5seurat Save Seurat data to h5seurat file. Default: false + --h5ad Save raw counts from the RNA and ATAC (if present) + assays to h5ad file(s). Default: false + --loupe Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. + Default: false + --cbbuild Export results to UCSC Cell Browser. Default: false + --scope Save Seurat data to SCope compatible loom file. Only + not normalized raw counts from the RNA assay will be + saved. Default: false + --tmpdir TMPDIR Directory to keep temporary files. Default: either + /tmp or defined by the environment variables TMPDIR, + TMP, TEMP. + --output OUTPUT Output prefix. 
Default: ./sc + --theme {gray,bw,linedraw,light,dark,minimal,classic,void} + Color theme for all generated plots. Default: classic + --cpus CPUS Number of cores/cpus to use. Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple --cpus. Default: 32 + --seed SEED Seed number for random values. Default: 42 \ No newline at end of file diff --git a/workflows/sc-atac-cluster.cwl b/workflows/sc-atac-cluster.cwl index 9240a4f1..df4eca3b 100644 --- a/workflows/sc-atac-cluster.cwl +++ b/workflows/sc-atac-cluster.cwl @@ -47,6 +47,7 @@ requirements: - "sc-rna-cluster.cwl" - "sc-rna-reduce.cwl" - "sc-atac-reduce.cwl" + - "sc-rna-azimuth.cwl" sc_atac_sample: - "cellranger-arc-count.cwl" - "cellranger-arc-aggr.cwl" diff --git a/workflows/sc-atac-coverage.cwl b/workflows/sc-atac-coverage.cwl index ec3b70f3..feb44a6d 100644 --- a/workflows/sc-atac-coverage.cwl +++ b/workflows/sc-atac-coverage.cwl @@ -24,6 +24,7 @@ requirements: - "sc-atac-cluster.cwl" - "sc-wnn-cluster.cwl" - "sc-ctype-assign.cwl" + - "sc-rna-azimuth.cwl" sc_atac_sample: - "cellranger-arc-count.cwl" - "cellranger-arc-aggr.cwl" diff --git a/workflows/sc-atac-dbinding.cwl b/workflows/sc-atac-dbinding.cwl index b2944f65..8f207d36 100644 --- a/workflows/sc-atac-dbinding.cwl +++ b/workflows/sc-atac-dbinding.cwl @@ -29,6 +29,7 @@ requirements: - "sc-atac-cluster.cwl" - "sc-wnn-cluster.cwl" - "sc-ctype-assign.cwl" + - "sc-rna-azimuth.cwl" sc_atac_sample: - "cellranger-arc-count.cwl" - "cellranger-arc-aggr.cwl" diff --git a/workflows/sc-atac-reduce.cwl b/workflows/sc-atac-reduce.cwl index 8a83e276..72e13c5c 100644 --- a/workflows/sc-atac-reduce.cwl +++ b/workflows/sc-atac-reduce.cwl @@ -22,6 +22,7 @@ requirements: - "sc-rna-reduce.cwl" - "sc-atac-filter.cwl" - "sc-multiome-filter.cwl" + - "sc-rna-azimuth.cwl" inputs: diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 39dd76e4..806c02f2 100644 --- a/workflows/sc-ctype-assign.cwl +++ 
b/workflows/sc-ctype-assign.cwl @@ -31,6 +31,7 @@ requirements: - "sc-rna-cluster.cwl" - "sc-atac-cluster.cwl" - "sc-wnn-cluster.cwl" + - "sc-rna-azimuth.cwl" sc_atac_sample: - "cellranger-arc-count.cwl" - "cellranger-arc-aggr.cwl" diff --git a/workflows/sc-rna-azimuth.cwl b/workflows/sc-rna-azimuth.cwl new file mode 100644 index 00000000..a2187a2d --- /dev/null +++ b/workflows/sc-rna-azimuth.cwl @@ -0,0 +1,859 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement + - class: InlineJavascriptRequirement + expressionLib: + - var split_features = function(line) { + function get_unique(value, index, self) { + return self.indexOf(value) === index && value != ""; + } + var splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; + return (splitted_line && !!splitted_line.length)?splitted_line:null; + }; + + +"sd:upstream": + sc_tools_sample: + - "sc-rna-filter.cwl" + - "sc-multiome-filter.cwl" + - "sc-rna-reduce.cwl" + - "sc-rna-cluster.cwl" + - "sc-wnn-cluster.cwl" + - "sc-atac-reduce.cwl" + - "sc-atac-cluster.cwl" + - "sc-ctype-assign.cwl" + sc_atac_sample: + - "cellranger-arc-count.cwl" + - "cellranger-arc-aggr.cwl" + + +inputs: + + alias: + type: string + label: "Analysis name" + sd:preview: + position: 1 + + query_data_rds: + type: File + label: "Single-cell Analysis with Filtered RNA-Seq Datasets" + doc: | + Analysis that includes filtered + single-cell data and was run through + "Single-Cell RNA-Seq Filtering Analysis" + or "Single-Cell Multiome ATAC-Seq and + RNA-Seq Filtering Analysis" at any of + the processing stages. + "sd:upstreamSource": "sc_tools_sample/seurat_data_rds" + "sd:localLabel": true + + atac_fragments_file: + type: File? 
+ secondaryFiles: + - .tbi + label: "Cell Ranger RNA+ATAC Sample (optional)" + doc: | + Any "Cell Ranger RNA+ATAC Sample" for + generating ATAC fragments coverage plots + over the genes of interest. This sample + can be obtained from the "Cell Ranger + Count (RNA+ATAC)" or "Cell Ranger + Aggregate (RNA+ATAC)" pipelines. + "sd:upstreamSource": "sc_atac_sample/atac_fragments_file" + "sd:localLabel": true + + reference_source_column: + type: string + label: "Reference Seurat Object annotation column" + doc: | + Column from the metadata of the reference Seurat + object to select the reference annotations. + + identify_diff_genes: + type: boolean? + default: true + label: "Find gene markers" + doc: | + Identify upregulated genes in each + predicted cell type compared to all + other cells. Include only genes that + are expressed in at least 10% of the + cells coming from either current cell + type or from all other cell types + together. Exclude genes with + log2FoldChange values less than 0.25. + Use Wilcoxon Rank Sum test to + calculate P-values. Keep only genes + with P-values lower than 0.01. Adjust + P-values for multiple comparisons + using Bonferroni correction. + Default: true + + identify_diff_peaks: + type: boolean? + default: false + label: "Find peak markers" + doc: | + Identify differentially accessible + peaks in each predicted cell type + compared to all other cells. Include + only peaks that are present in at + least 5% of the cells coming from + either current cell type or from all + other cell types together. Exclude + peaks with log2FoldChange values less + than 0.25. Use logistic regression + framework to calculate P-values. Keep + only peaks with P-values lower than + 0.01. Adjust P-values for multiple + comparisons using Bonferroni + correction. + Default: false + + genes_of_interest: + type: string?
+ default: null + label: "Genes of interest" + doc: | + Comma or space separated list of genes + of interest to visualize expression and + to generate ATAC fragments coverage plots. + Coverage plots are skipped if "Cell Ranger + RNA+ATAC Sample (optional)" is not provided. + Default: None + + reference_data_rds: + type: File + label: "Reference Seurat Object (ref.Rds) file" + doc: | + RDS file to load the reference Seurat object from. + This file can be downloaded as ref.Rds from the + https://azimuth.hubmapconsortium.org/references/ + + reference_data_index: + type: File + label: "Reference Annoy Index (idx.annoy) file" + doc: | + Annoy index file for the provided reference RDS file. + This file can be downloaded as idx.annoy from the + https://azimuth.hubmapconsortium.org/references/ + + export_loupe_data: + type: boolean? + default: false + label: "Save raw counts to Loupe file. I confirm that data is generated by 10x technology and accept the EULA available at https://10xgen.com/EULA" + doc: | + Save raw counts from the RNA assay to Loupe file. By + enabling this feature you accept the End-User License + Agreement available at https://10xgen.com/EULA. + Default: false + "sd:layout": + advanced: true + + export_html_report: + type: boolean? + default: true + label: "Show HTML report" + doc: | + Export technical report in HTML format. + Default: true + "sd:layout": + advanced: true + + color_theme: + type: + - "null" + - type: enum + symbols: + - "gray" + - "bw" + - "linedraw" + - "light" + - "dark" + - "minimal" + - "classic" + - "void" + default: "classic" + label: "Plots color theme" + doc: | + Color theme for all plots saved + as PNG files. + Default: classic + "sd:layout": + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + - "2" + - "3" + - "4" + - "5" + - "6" + default: "6" + label: "Cores/CPUs" + doc: | + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously.
+ Default: 6 + "sd:layout": + advanced: true + + +outputs: + + cell_cnts_gr_ctyp_plot_png: + type: File? + outputSource: rna_azimuth/cell_cnts_gr_ctyp_plot_png + label: "Number of cells per cell type (all cells)" + doc: | + Number of cells per cell type. + All cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Number of cells per cell type (all cells)" + + umap_qc_mtrcs_plot_png: + type: File? + outputSource: rna_azimuth/umap_qc_mtrcs_plot_png + label: "UMAP, QC metrics (all cells)" + doc: | + UMAP, QC metrics. + All cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "UMAP, QC metrics (all cells)" + + qc_mtrcs_dnst_gr_ctyp_plot_png: + type: File? + outputSource: rna_azimuth/qc_mtrcs_dnst_gr_ctyp_plot_png + label: "Distribution of QC metrics per cell colored by cell type (all cells)" + doc: | + Distribution of QC metrics per cell + colored by cell type. + All cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Distribution of QC metrics per cell colored by cell type (all cells)" + + gene_umi_spl_ctyp_plot_png: + type: File? + outputSource: rna_azimuth/gene_umi_spl_ctyp_plot_png + label: "Genes vs RNA reads per cell (split by cell type, all cells)" + doc: | + Genes vs RNA reads per cell. + Split by cell type; all cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Genes vs RNA reads per cell (split by cell type, all cells)" + + umi_mito_spl_ctyp_plot_png: + type: File? + outputSource: rna_azimuth/umi_mito_spl_ctyp_plot_png + label: "RNA reads vs mitochondrial % per cell (split by cell type, all cells)" + doc: | + RNA reads vs mitochondrial % per cell. + Split by cell type; all cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "RNA reads vs mitochondrial % per cell (split by cell type, all cells)" + + tss_frgm_spl_ctyp_plot_png: + type: File? 
+ outputSource: rna_azimuth/tss_frgm_spl_ctyp_plot_png + label: "TSS enrichment score vs ATAC fragments in peaks per cell (split by cell type, all cells)" + doc: | + TSS enrichment score vs ATAC + fragments in peaks per cell. + Split by cell type; all cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "TSS enrichment score vs ATAC fragments in peaks per cell (split by cell type, all cells)" + + rna_atac_cnts_spl_ctyp_plot_png: + type: File? + outputSource: rna_azimuth/rna_atac_cnts_spl_ctyp_plot_png + label: "RNA reads vs ATAC fragments in peaks per cell (split by cell type, all cells)" + doc: | + RNA reads vs ATAC fragments in peaks per cell. + Split by cell type; all cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "RNA reads vs ATAC fragments in peaks per cell (split by cell type, all cells)" + + rnadbl_gr_ctyp_plot_png: + type: File? + outputSource: rna_azimuth/rnadbl_gr_ctyp_plot_png + label: "Percentage of RNA doublets per cell type (all cells)" + doc: | + Percentage of RNA doublets per cell type. + All cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Percentage of RNA doublets per cell type (all cells)" + + atacdbl_gr_ctyp_plot_png: + type: File? + outputSource: rna_azimuth/atacdbl_gr_ctyp_plot_png + label: "Percentage of ATAC doublets per cell type (all cells)" + doc: | + Percentage of ATAC doublets per cell type. + All cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Percentage of ATAC doublets per cell type (all cells)" + + vrlpdbl_gr_ctyp_plot_png: + type: File? + outputSource: rna_azimuth/vrlpdbl_gr_ctyp_plot_png + label: "Percentage of RNA and ATAC doublets per cell type (all cells)" + doc: | + Percentage of RNA and ATAC doublets + per cell type. + All cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Percentage of RNA and ATAC doublets per cell type (all cells)" + + umap_gr_ctyp_plot_png: + type: File? 
+ outputSource: rna_azimuth/umap_gr_ctyp_plot_png + label: "UMAP colored by cell type (all cells)" + doc: | + UMAP colored by cell type. + All cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "Split by cell type" + Caption: "UMAP colored by cell type (all cells)" + + umap_gr_ctyp_spl_ph_png: + type: File? + outputSource: rna_azimuth/umap_gr_ctyp_spl_ph_png + label: "UMAP colored by cell type (split by cell cycle phase, optionally downsampled)" + doc: | + UMAP colored by cell type. + Split by cell cycle phase; downsampled + to the smallest dataset (if multiple + datasets are analyzed jointly). + PNG format. + "sd:visualPlugins": + - image: + tab: "Split by cell type" + Caption: "UMAP colored by cell type (split by cell cycle phase, optionally downsampled)" + + cmp_gr_ph_spl_ctyp_png: + type: File? + outputSource: rna_azimuth/cmp_gr_ph_spl_ctyp_png + label: "Composition plot colored by cell cycle phase (split by cell type, optionally downsampled)" + doc: | + Composition plot colored by cell cycle phase. + Split by cell type; downsampled to the + smallest dataset (if multiple datasets are + analyzed jointly). + PNG format. + "sd:visualPlugins": + - image: + tab: "Split by cell type" + Caption: "Composition plot colored by cell cycle phase (split by cell type, optionally downsampled)" + + umap_gr_ctyp_spl_idnt_plot_png: + type: File? + outputSource: rna_azimuth/umap_gr_ctyp_spl_idnt_plot_png + label: "UMAP colored by cell type (split by dataset, downsampled)" + doc: | + UMAP colored by cell type. + Split by dataset; downsampled to the + smallest dataset. + PNG format. + "sd:visualPlugins": + - image: + tab: "Split by dataset" + Caption: "UMAP colored by cell type (split by dataset, downsampled)" + + cmp_gr_ctyp_spl_idnt_plot_png: + type: File? + outputSource: rna_azimuth/cmp_gr_ctyp_spl_idnt_plot_png + label: "Composition plot colored by cell type (split by dataset, downsampled)" + doc: | + Composition plot colored by cell type. 
+ Split by dataset; downsampled to the + smallest dataset. + PNG format. + "sd:visualPlugins": + - image: + tab: "Split by dataset" + Caption: "Composition plot colored by cell type (split by dataset, downsampled)" + + umap_gr_ph_spl_idnt_plot_png: + type: File? + outputSource: rna_azimuth/umap_gr_ph_spl_idnt_plot_png + label: "UMAP colored by cell cycle phase (split by dataset, downsampled)" + doc: | + UMAP colored by cell cycle phase. + Split by dataset; downsampled to the + smallest dataset. + PNG format. + "sd:visualPlugins": + - image: + tab: "Split by dataset" + Caption: "UMAP colored by cell cycle phase (split by dataset, downsampled)" + + cmp_gr_ph_spl_idnt_plot_png: + type: File? + outputSource: rna_azimuth/cmp_gr_ph_spl_idnt_plot_png + label: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" + doc: | + Composition plot colored by cell cycle phase. + Split by dataset; downsampled to the smallest + dataset. + PNG format. + "sd:visualPlugins": + - image: + tab: "Split by dataset" + Caption: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" + + umap_gr_ctyp_spl_cnd_plot_png: + type: File? + outputSource: rna_azimuth/umap_gr_ctyp_spl_cnd_plot_png + label: "UMAP colored by cell type (split by grouping condition, downsampled)" + doc: | + UMAP colored by cell type. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + "sd:visualPlugins": + - image: + tab: "Split by group" + Caption: "UMAP colored by cell type (split by grouping condition, downsampled)" + + cmp_gr_ctyp_spl_cnd_plot_png: + type: File? + outputSource: rna_azimuth/cmp_gr_ctyp_spl_cnd_plot_png + label: "Composition plot colored by cell type (split by grouping condition, downsampled)" + doc: | + Composition plot colored by cell type. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "Split by group" + Caption: "Composition plot colored by cell type (split by grouping condition, downsampled)" + + umap_gr_ph_spl_cnd_plot_png: + type: File? + outputSource: rna_azimuth/umap_gr_ph_spl_cnd_plot_png + label: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" + doc: | + UMAP colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + "sd:visualPlugins": + - image: + tab: "Split by group" + Caption: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" + + cmp_gr_ph_spl_cnd_plot_png: + type: File? + outputSource: rna_azimuth/cmp_gr_ph_spl_cnd_plot_png + label: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" + doc: | + Composition plot colored by cell cycle phase. + Split by grouping condition; first downsampled + to the smallest dataset, then downsampled to + the smallest group. + PNG format. + "sd:visualPlugins": + - image: + tab: "Split by group" + Caption: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" + + xpr_avg_plot_png: + type: File? + outputSource: rna_azimuth/xpr_avg_plot_png + label: "Average gene expression" + doc: | + Average gene expression. + PNG format. + "sd:visualPlugins": + - image: + tab: "Genes of interest (expression)" + Caption: "Average gene expression" + + xpr_dnst_plot_png: + type: File? + outputSource: rna_azimuth/xpr_dnst_plot_png + label: "Gene expression density" + doc: | + Gene expression density. + PNG format. + "sd:visualPlugins": + - image: + tab: "Genes of interest (expression)" + Caption: "Gene expression density" + + xpr_per_cell_plot_png: + type: + - "null" + - type: array + items: File + outputSource: rna_azimuth/xpr_per_cell_plot_png + label: "UMAP colored by gene expression (per gene)" + doc: | + UMAP colored by gene expression. 
+ All genes of interest. + PNG format. + "sd:visualPlugins": + - image: + tab: "Genes of interest (expression)" + Caption: "UMAP colored by gene expression (per gene)" + + cvrg_plot_png: + type: + - "null" + - type: array + items: File + outputSource: rna_azimuth/cvrg_plot_png + label: "ATAC fragment coverage (per gene)" + doc: | + ATAC fragment coverage. + All genes of interest. + PNG format. + "sd:visualPlugins": + - image: + tab: "Genes of interest (coverage)" + Caption: "ATAC fragment coverage (per gene)" + + xpr_htmp_plot_png: + type: File? + outputSource: rna_azimuth/xpr_htmp_plot_png + label: "Gene expression heatmap (top gene markers)" + doc: | + Gene expression heatmap. + Top gene markers. + PNG format. + "sd:visualPlugins": + - image: + tab: "Gene markers heatmap" + Caption: "Gene expression heatmap (top gene markers)" + + xpr_htmp_tsv: + type: File? + outputSource: rna_azimuth/xpr_htmp_tsv + label: "Gene expression heatmap (top gene markers)" + doc: | + Gene expression heatmap. + Top gene markers. + TSV format. + + gene_markers_tsv: + type: File? + outputSource: rna_azimuth/gene_markers_tsv + label: "Gene markers" + doc: | + Gene markers. + TSV format. + "sd:visualPlugins": + - syncfusiongrid: + tab: "Gene markers table" + Title: "Gene markers" + + peak_markers_tsv: + type: File? + outputSource: rna_azimuth/peak_markers_tsv + label: "Peak markers" + doc: | + Peak markers. + TSV format. + "sd:visualPlugins": + - syncfusiongrid: + tab: "Peak markers table" + Title: "Peak markers" + + ucsc_cb_html_data: + type: Directory? + outputSource: rna_azimuth/ucsc_cb_html_data + label: "UCSC Cell Browser (data)" + doc: | + UCSC Cell Browser html data. + + ucsc_cb_html_file: + type: File? + outputSource: rna_azimuth/ucsc_cb_html_file + label: "UCSC Cell Browser" + doc: | + UCSC Cell Browser html index. 
+ "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + + seurat_data_rds: + type: File + outputSource: rna_azimuth/seurat_data_rds + label: "Seurat object in RDS format" + doc: | + Seurat object. + RDS format. + + seurat_data_scope: + type: File? + outputSource: rna_azimuth/seurat_data_scope + label: "Seurat object in SCope compatible loom format" + doc: | + Seurat object. + SCope compatible. + Loom format. + + seurat_rna_data_cloupe: + type: File? + outputSource: rna_azimuth/seurat_rna_data_cloupe + label: "Seurat object in Loupe format" + doc: | + Seurat object. + RNA counts. + Loupe format. + + pdf_plots: + type: File + outputSource: compress_pdf_plots/compressed_folder + label: "Compressed folder with all PDF plots" + doc: | + Compressed folder with all PDF plots. + + sc_report_html_file: + type: File? + outputSource: rna_azimuth/sc_report_html_file + label: "Analysis log" + doc: | + Technical report. + HTML format. + "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + + rna_azimuth_stdout_log: + type: File + outputSource: rna_azimuth/stdout_log + label: "Output log" + doc: | + Stdout log from the rna_azimuth step. + + rna_azimuth_stderr_log: + type: File + outputSource: rna_azimuth/stderr_log + label: "Error log" + doc: | + Stderr log from the rna_azimuth step.
+ + +steps: + + rna_azimuth: + run: ../tools/sc-rna-azimuth.cwl + in: + query_data_rds: query_data_rds + reference_data_rds: reference_data_rds + reference_data_index: reference_data_index + reference_source_column: reference_source_column + atac_fragments_file: atac_fragments_file + genes_of_interest: + source: genes_of_interest + valueFrom: $(split_features(self)) + identify_diff_genes: identify_diff_genes + identify_diff_peaks: identify_diff_peaks + rna_minimum_logfc: + default: 0.25 + rna_minimum_pct: + default: 0.1 + atac_minimum_logfc: + default: 0.25 + atac_minimum_pct: + default: 0.05 + only_positive_diff_genes: + default: true + rna_test_to_use: + default: wilcox + atac_test_to_use: + default: LR + verbose: + default: true + export_ucsc_cb: + default: true + export_scope_data: + default: true + export_loupe_data: export_loupe_data + export_pdf_plots: + default: true + color_theme: color_theme + parallel_memory_limit: + default: 32 + vector_memory_limit: + default: 128 + export_html_report: export_html_report + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - cell_cnts_gr_ctyp_plot_png + - umap_qc_mtrcs_plot_png + - gene_umi_spl_ctyp_plot_png + - umi_mito_spl_ctyp_plot_png + - rnadbl_gr_ctyp_plot_png + - tss_frgm_spl_ctyp_plot_png + - atacdbl_gr_ctyp_plot_png + - rna_atac_cnts_spl_ctyp_plot_png + - vrlpdbl_gr_ctyp_plot_png + - qc_mtrcs_dnst_gr_ctyp_plot_png + - umap_gr_ctyp_plot_png + - umap_gr_ctyp_spl_idnt_plot_png + - cmp_gr_ctyp_spl_idnt_plot_png + - umap_gr_ph_spl_idnt_plot_png + - cmp_gr_ph_spl_idnt_plot_png + - umap_gr_ctyp_spl_ph_png + - cmp_gr_ph_spl_ctyp_png + - umap_gr_ctyp_spl_cnd_plot_png + - cmp_gr_ctyp_spl_cnd_plot_png + - umap_gr_ph_spl_cnd_plot_png + - cmp_gr_ph_spl_cnd_plot_png + - xpr_avg_plot_png + - xpr_per_cell_plot_png + - xpr_dnst_plot_png + - xpr_htmp_plot_png + - cvrg_plot_png + - all_plots_pdf + - xpr_htmp_tsv + - gene_markers_tsv + - peak_markers_tsv + - ucsc_cb_html_data + - ucsc_cb_html_file + - 
seurat_data_rds + - seurat_data_scope + - seurat_rna_data_cloupe + - sc_report_html_file + - stdout_log + - stderr_log + + folder_pdf_plots: + run: ../tools/files-to-folder.cwl + in: + input_files: + source: + - rna_azimuth/all_plots_pdf + valueFrom: $(self.flat().filter(n => n)) + folder_basename: + default: "pdf_plots" + out: + - folder + + compress_pdf_plots: + run: ../tools/tar-compress.cwl + in: + folder_to_compress: folder_pdf_plots/folder + out: + - compressed_folder + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-Cell RNA-Seq Reference Mapping" +s:name: "Single-Cell RNA-Seq Reference Mapping" +s:alternateName: "Single-Cell RNA-Seq Reference Mapping" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows-datirium/master/workflows/sc-rna-azimuth.cwl +s:codeRepository: https://github.com/Barski-lab/workflows-datirium +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-Cell RNA-Seq Reference Mapping + + Assigns identities to cells based on the reference 
annotation + using the Azimuth R package. Reference models can be downloaded + from the https://azimuth.hubmapconsortium.org/ website. This + workflow can be run with the outputs of the following pipelines: + "Single-Cell RNA-Seq Filtering Analysis", "Single-Cell Multiome + ATAC-Seq and RNA-Seq Filtering Analysis", "Single-Cell RNA-Seq + Dimensionality Reduction Analysis", "Single-Cell RNA-Seq Cluster + Analysis", and "Single-Cell WNN Cluster Analysis". It can also be + used with the outputs of: "Single-Cell ATAC-Seq Dimensionality + Reduction Analysis", "Single-Cell ATAC-Seq Cluster Analysis", + "Single-Cell Manual Cell Type Assignment" pipelines if these were + part of the multiome data analysis. The results of this workflow + are compatible with any single cell pipeline normally used after + the "Single-Cell RNA-Seq Filtering Analysis" or "Single-Cell + Multiome ATAC-Seq and RNA-Seq Filtering Analysis" pipelines, + depending on the preceding analysis step. In other words, this + pipeline predicts cell types for high-quality cells without + impacting subsequent data analysis steps. 
\ No newline at end of file diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index d248d789..c3cfdc51 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -47,6 +47,7 @@ requirements: - "sc-atac-cluster.cwl" - "sc-rna-reduce.cwl" - "sc-atac-reduce.cwl" + - "sc-rna-azimuth.cwl" inputs: diff --git a/workflows/sc-rna-da-cells.cwl b/workflows/sc-rna-da-cells.cwl index 5c667aa6..6c71a37b 100644 --- a/workflows/sc-rna-da-cells.cwl +++ b/workflows/sc-rna-da-cells.cwl @@ -22,7 +22,7 @@ requirements: - "sc-atac-cluster.cwl" - "sc-wnn-cluster.cwl" - "sc-ctype-assign.cwl" - - "sc-rna-de-pseudobulk.cwl" + - "sc-rna-azimuth.cwl" inputs: diff --git a/workflows/sc-rna-de-pseudobulk.cwl b/workflows/sc-rna-de-pseudobulk.cwl index 4cac69f9..6e11cbad 100644 --- a/workflows/sc-rna-de-pseudobulk.cwl +++ b/workflows/sc-rna-de-pseudobulk.cwl @@ -30,6 +30,7 @@ requirements: - "sc-ctype-assign.cwl" - "sc-wnn-cluster.cwl" - "sc-rna-da-cells.cwl" + - "sc-rna-azimuth.cwl" inputs: diff --git a/workflows/sc-rna-reduce.cwl b/workflows/sc-rna-reduce.cwl index 543d1968..2d8bdd64 100644 --- a/workflows/sc-rna-reduce.cwl +++ b/workflows/sc-rna-reduce.cwl @@ -23,6 +23,7 @@ requirements: - "sc-atac-reduce.cwl" - "sc-rna-filter.cwl" - "sc-multiome-filter.cwl" + - "sc-rna-azimuth.cwl" inputs: diff --git a/workflows/sc-rna-trajectory.cwl b/workflows/sc-rna-trajectory.cwl index 6af422a4..764ed7f2 100644 --- a/workflows/sc-rna-trajectory.cwl +++ b/workflows/sc-rna-trajectory.cwl @@ -23,6 +23,7 @@ requirements: - "sc-atac-cluster.cwl" - "sc-wnn-cluster.cwl" - "sc-ctype-assign.cwl" + - "sc-rna-azimuth.cwl" inputs: diff --git a/workflows/sc-triangulate.cwl b/workflows/sc-triangulate.cwl index 923aac35..f5055d25 100644 --- a/workflows/sc-triangulate.cwl +++ b/workflows/sc-triangulate.cwl @@ -28,6 +28,7 @@ requirements: - "sc-rna-cluster.cwl" - "sc-atac-cluster.cwl" - "sc-wnn-cluster.cwl" + - "sc-rna-azimuth.cwl" inputs: diff --git 
a/workflows/sc-vdj-profile.cwl b/workflows/sc-vdj-profile.cwl index b34ad630..4a93302c 100644 --- a/workflows/sc-vdj-profile.cwl +++ b/workflows/sc-vdj-profile.cwl @@ -14,6 +14,7 @@ requirements: - "sc-rna-reduce.cwl" - "sc-rna-cluster.cwl" - "sc-ctype-assign.cwl" + - "sc-rna-azimuth.cwl" sc_vdj_sample: - "cellranger-multi.cwl" - "cellranger-aggr.cwl" diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index a0ecfa49..fbcea837 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -48,6 +48,7 @@ requirements: - "sc-atac-cluster.cwl" - "sc-rna-reduce.cwl" - "sc-atac-reduce.cwl" + - "sc-rna-azimuth.cwl" sc_arc_sample: - "cellranger-arc-count.cwl" - "cellranger-arc-aggr.cwl" From f8395549bf4091e6806ecef19d5305af171f94ae Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Wed, 18 Sep 2024 16:26:44 -0400 Subject: [PATCH 155/162] Update sc rna azimuth pipeline with min conf and min map score thresholds --- tools/sc-rna-azimuth.cwl | 324 ++++++++++++++++++-------- workflows/sc-rna-azimuth.cwl | 430 ++++++++++++++++++++++++----------- 2 files changed, 522 insertions(+), 232 deletions(-) diff --git a/tools/sc-rna-azimuth.cwl b/tools/sc-rna-azimuth.cwl index 280b6ea8..5bb33b7d 100644 --- a/tools/sc-rna-azimuth.cwl +++ b/tools/sc-rna-azimuth.cwl @@ -56,6 +56,37 @@ inputs: Column from the metadata of the reference Seurat object to select the reference annotations. + minimum_confidence_score: + type: float? + inputBinding: + prefix: "--minconfscore" + doc: | + The minimum threshold for a prediction + confidence score is calculated at the cell + level. This metric ranges from 0 to 1 and + reflects the confidence associated with each + annotation. Only cells that meet both the + minimum prediction confidence score and the + minimum prediction mapping score thresholds + will be included in the analysis. + Default: 0.75 + + minimum_mapping_score: + type: float? 
+ inputBinding: + prefix: "--minmapscore" + doc: | + The minimum threshold for a prediction + mapping score is calculated at the cell. + This metric ranges from 0 to 1 and reflects + how well the unique structure of a cell's + local neighborhood is preserved during + reference mapping. Only cells that meet both + the minimum prediction mapping score and the + minimum prediction confidence score thresholds + will be included in the analysis. + Default: 0.75 + identify_diff_genes: type: boolean? inputBinding: @@ -355,106 +386,182 @@ inputs: outputs: + ref_cell_cnts_gr_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_ref_cell_cnts_gr_ctyp.png" + doc: | + Number of cells per cell type. + All reference cells. + PNG format. + + ref_umap_gr_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_ref_umap_gr_ctyp.png" + doc: | + Reference UMAP colored by cell type. + All reference cells. + PNG format. + cell_cnts_gr_ctyp_plot_png: type: File? outputBinding: - glob: "*_cell_cnts_gr_ctyp.png" + glob: "*[!_ref]_cell_cnts_gr_ctyp.png" doc: | Number of cells per cell type. - All cells. + All query cells. + PNG format. + + umap_cnf_plot_png: + type: File? + outputBinding: + glob: "*_umap_cnf.png" + doc: | + Projected UMAP colored by + prediction confidence score. + All query cells. + PNG format. + + umap_map_plot_png: + type: File? + outputBinding: + glob: "*_umap_map.png" + doc: | + Projected UMAP colored by + prediction mapping score. + All query cells. + PNG format. + + qc_mtrcs_dnst_gr_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_qc_mtrcs_dnst_gr_ctyp.png" + doc: | + Distribution of QC metrics per + cell colored by cell type. + All query cells. PNG format. - umap_qc_mtrcs_plot_png: + gene_umi_gr_cnf_spl_ctyp_plot_png: type: File? outputBinding: - glob: "*_umap_qc_mtrcs.png" + glob: "*_gene_umi_gr_cnf_spl_ctyp.png" doc: | - UMAP, QC metrics. - All cells. + Genes vs RNA reads per cell. 
+ All query cells; split by cell type; + colored by prediction confidence score. PNG format. - gene_umi_spl_ctyp_plot_png: + gene_umi_gr_map_spl_ctyp_plot_png: type: File? outputBinding: - glob: "*_gene_umi_spl_ctyp.png" + glob: "*_gene_umi_gr_map_spl_ctyp.png" doc: | Genes vs RNA reads per cell. - Split by cell type; all cells. + All query cells; split by cell type; + colored by prediction mapping score. PNG format. - umi_mito_spl_ctyp_plot_png: + umi_mito_gr_cnf_spl_ctyp_plot_png: type: File? outputBinding: - glob: "*_umi_mito_spl_ctyp.png" + glob: "*_umi_mito_gr_cnf_spl_ctyp.png" doc: | RNA reads vs mitochondrial % per cell. - Split by cell type; all cells. + All query cells; split by cell type; + colored by prediction confidence score. PNG format. - rnadbl_gr_ctyp_plot_png: + umi_mito_gr_map_spl_ctyp_plot_png: type: File? outputBinding: - glob: "*_rnadbl_gr_ctyp.png" + glob: "*_umi_mito_gr_map_spl_ctyp.png" doc: | - Percentage of RNA doublets per cell type. - All cells. + RNA reads vs mitochondrial % per cell. + All query cells; split by cell type; + colored by prediction mapping score. PNG format. - tss_frgm_spl_ctyp_plot_png: + tss_frgm_gr_cnf_spl_ctyp_plot_png: type: File? outputBinding: - glob: "*_tss_frgm_spl_ctyp.png" + glob: "*_tss_frgm_gr_cnf_spl_ctyp.png" doc: | - TSS enrichment score vs ATAC - fragments in peaks per cell. - Split by cell type; all cells. + TSS enrichment score vs ATAC fragments + in peaks per cell. + All query cells; split by cell type; + colored by prediction confidence score. PNG format. - atacdbl_gr_ctyp_plot_png: + tss_frgm_gr_map_spl_ctyp_plot_png: type: File? outputBinding: - glob: "*_atacdbl_gr_ctyp.png" + glob: "*_tss_frgm_gr_map_spl_ctyp.png" doc: | - Percentage of ATAC doublets per cell type. - All cells. + TSS enrichment score vs ATAC fragments + in peaks per cell. + All query cells; split by cell type; + colored by prediction mapping score. PNG format. 
- rna_atac_cnts_spl_ctyp_plot_png: + rna_atac_cnts_gr_cnf_spl_ctyp_plot_png: type: File? outputBinding: - glob: "*_rna_atac_cnts_spl_ctyp.png" + glob: "*_rna_atac_cnts_gr_cnf_spl_ctyp.png" doc: | RNA reads vs ATAC fragments in peaks per cell. - Split by cell type; all cells. + All query cells; split by cell type; colored + by prediction confidence score. PNG format. - vrlpdbl_gr_ctyp_plot_png: + rna_atac_cnts_gr_map_spl_ctyp_plot_png: type: File? outputBinding: - glob: "*_vrlpdbl_gr_ctyp.png" + glob: "*_rna_atac_cnts_gr_map_spl_ctyp.png" + doc: | + RNA reads vs ATAC fragments in peaks per cell. + All query cells; split by cell type; colored + by prediction mapping score. + PNG format. + + rnadbl_gr_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_rnadbl_gr_ctyp.png" doc: | - Percentage of RNA and ATAC doublets + Percentage of RNA doublets per cell type. - All cells. + All query cells. PNG format. - qc_mtrcs_dnst_gr_ctyp_plot_png: + atacdbl_gr_ctyp_plot_png: type: File? outputBinding: - glob: "*_qc_mtrcs_dnst_gr_ctyp.png" + glob: "*_atacdbl_gr_ctyp.png" doc: | - Distribution of QC metrics per cell - colored by cell type. - All cells. + Percentage of ATAC doublets + per cell type. + All query cells. + PNG format. + + vrlpdbl_gr_ctyp_plot_png: + type: File? + outputBinding: + glob: "*_vrlpdbl_gr_ctyp.png" + doc: | + Percentage of RNA and ATAC + doublets per cell type. + All query cells. PNG format. umap_gr_ctyp_plot_png: type: File? outputBinding: - glob: "*_umap_gr_ctyp.png" + glob: "*[!_ref]_umap_gr_ctyp.png" doc: | - UMAP colored by cell type. - All cells. + Projected UMAP colored by cell type. + Filtered query cells. PNG format. umap_gr_ctyp_spl_idnt_plot_png: @@ -462,9 +569,9 @@ outputs: outputBinding: glob: "*_umap_gr_ctyp_spl_idnt.png" doc: | - UMAP colored by cell type. - Split by dataset; downsampled to the - smallest dataset. + Projected UMAP colored by cell type. + Filtered query cells; split by dataset; + downsampled to the smallest dataset. 
PNG format. cmp_gr_ctyp_spl_idnt_plot_png: @@ -473,8 +580,8 @@ outputs: glob: "*_cmp_gr_ctyp_spl_idnt.png" doc: | Composition plot colored by cell type. - Split by dataset; downsampled to the - smallest dataset. + Filtered query cells; split by dataset; + downsampled to the smallest dataset. PNG format. cmp_gr_idnt_spl_ctyp_plot_png: @@ -483,8 +590,8 @@ outputs: glob: "*_cmp_gr_idnt_spl_ctyp.png" doc: | Composition plot colored by dataset. - Split by cell type; downsampled to - the smallest dataset. + Filtered query cells; split by cell type; + downsampled to the smallest dataset. PNG format. umap_gr_ph_spl_idnt_plot_png: @@ -492,9 +599,9 @@ outputs: outputBinding: glob: "*_umap_gr_ph_spl_idnt.png" doc: | - UMAP colored by cell cycle phase. - Split by dataset; downsampled to the - smallest dataset. + Projected UMAP colored by cell cycle phase. + Filtered query cells; split by dataset; + downsampled to the smallest dataset. PNG format. cmp_gr_ph_spl_idnt_plot_png: @@ -503,8 +610,8 @@ outputs: glob: "*_cmp_gr_ph_spl_idnt.png" doc: | Composition plot colored by cell cycle phase. - Split by dataset; downsampled to the smallest - dataset. + Filtered query cells; split by dataset; + downsampled to the smallest dataset. PNG format. umap_gr_ctyp_spl_ph_png: @@ -512,10 +619,11 @@ outputs: outputBinding: glob: "*_umap_gr_ctyp_spl_ph.png" doc: | - UMAP colored by cell type. - Split by cell cycle phase; downsampled - to the smallest dataset (if multiple - datasets are analyzed jointly). + Projected UMAP colored by cell type. + Filtered query cells; split by cell + cycle phase; downsampled to the + smallest dataset (if multiple datasets + are analyzed jointly). PNG format. cmp_gr_ph_spl_ctyp_png: @@ -524,9 +632,9 @@ outputs: glob: "*_cmp_gr_ph_spl_ctyp.png" doc: | Composition plot colored by cell cycle phase. - Split by cell type; downsampled to the - smallest dataset (if multiple datasets are - analyzed jointly). 
+ Filtered query cells; split by cell type; + downsampled to the smallest dataset (if + multiple datasets are analyzed jointly). PNG format. umap_gr_ctyp_spl_cnd_plot_png: @@ -534,9 +642,10 @@ outputs: outputBinding: glob: "*_umap_gr_ctyp_spl_cnd.png" doc: | - UMAP colored by cell type. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to + Projected UMAP colored by cell type. + Filtered query cells; split by grouping + condition; first downsampled to the + smallest dataset, then downsampled to the smallest group. PNG format. @@ -546,8 +655,9 @@ outputs: glob: "*_cmp_gr_ctyp_spl_cnd.png" doc: | Composition plot colored by cell type. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to + Filtered query cells; split by grouping + condition; first downsampled to the + smallest dataset, then downsampled to the smallest group. PNG format. @@ -557,9 +667,9 @@ outputs: glob: "*_cmp_gr_cnd_spl_ctyp.png" doc: | Composition plot colored by grouping condition. - Split by cell type; first downsampled to the - smallest dataset, then downsampled to the - smallest group. + Filtered query cells; split by cell type; + first downsampled to the smallest dataset, + then downsampled to the smallest group. PNG format. umap_gr_ph_spl_cnd_plot_png: @@ -567,10 +677,10 @@ outputs: outputBinding: glob: "*_umap_gr_ph_spl_cnd.png" doc: | - UMAP colored by cell cycle phase. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group. + Projected UMAP colored by cell cycle phase. + Filtered query cells; split by grouping + condition; first downsampled to the smallest + dataset, then downsampled to the smallest group. PNG format. cmp_gr_ph_spl_cnd_plot_png: @@ -579,9 +689,9 @@ outputs: glob: "*_cmp_gr_ph_spl_cnd.png" doc: | Composition plot colored by cell cycle phase. 
- Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group. + Filtered query cells; split by grouping condition; + first downsampled to the smallest dataset, then + downsampled to the smallest group. PNG format. xpr_avg_plot_png: @@ -590,6 +700,16 @@ outputs: glob: "*_xpr_avg.png" doc: | Average gene expression. + Filtered query cells. + PNG format. + + xpr_dnst_plot_png: + type: File? + outputBinding: + glob: "*_xpr_dnst.png" + doc: | + Gene expression density. + Filtered query cells. PNG format. xpr_per_cell_plot_png: @@ -600,8 +720,8 @@ outputs: outputBinding: glob: "*_xpr_per_cell_[!sgnl_]*.png" doc: | - UMAP colored by gene expression. - All genes of interest. + Projected UMAP colored by gene expression. + Filtered query cells; all genes of interest. PNG format. xpr_per_cell_sgnl_plot_png: @@ -612,16 +732,10 @@ outputs: outputBinding: glob: "*_xpr_per_cell_sgnl_*.png" doc: | - UMAP colored by gene expression density. - All genes of interest. - PNG format. - - xpr_dnst_plot_png: - type: File? - outputBinding: - glob: "*_xpr_dnst.png" - doc: | - Gene expression density. + Projected UMAP colored by gene + expression density. + Filtered query cells; all genes + of interest. PNG format. xpr_htmp_plot_png: @@ -629,7 +743,8 @@ outputs: outputBinding: glob: "*_xpr_htmp.png" doc: | - Gene expression heatmap. + Gene expression heatmap from + the filtered query cells. Top gene markers. PNG format. @@ -642,7 +757,8 @@ outputs: glob: "*_cvrg_*.png" doc: | ATAC fragment coverage. - All genes of interest. + Filtered query cells; + all genes of interest. PNG format. all_plots_pdf: @@ -661,7 +777,8 @@ outputs: outputBinding: glob: "*_xpr_htmp.tsv" doc: | - Gene expression heatmap. + Gene expression heatmap from + the filtered query cells. Top gene markers. TSV format. @@ -670,7 +787,8 @@ outputs: outputBinding: glob: "*_gene_markers.tsv" doc: | - Gene markers. + Gene markers from the filtered + query cells. TSV format. 
peak_markers_tsv: @@ -678,7 +796,8 @@ outputs: outputBinding: glob: "*_peak_markers.tsv" doc: | - Peak markers. + Peak markers from the filtered + query cells. TSV format. ucsc_cb_config_data: @@ -834,8 +953,10 @@ doc: | s:about: | - usage: /tmp/sc_tools/sc_rna_azimuth.R [-h] --query QUERY --reference REFERENCE - --annoyidx ANNOYIDX --source SOURCE + usage: /usr/local/bin/sc_rna_azimuth.R [-h] --query QUERY --reference + REFERENCE --annoyidx ANNOYIDX --source + SOURCE [--minconfscore MINCONFSCORE] + [--minmapscore MINMAPSCORE] [--diffgenes] [--diffpeaks] [--rnalogfc RNALOGFC] [--rnaminpct RNAMINPCT] [--rnaonlypos] @@ -874,6 +995,23 @@ s:about: | https://azimuth.hubmapconsortium.org/references/ --source SOURCE Column from the metadata of the reference Seurat object to select the reference annotations. + --minconfscore MINCONFSCORE + The minimum threshold for a prediction confidence + score is calculated at the cell level. This metric + ranges from 0 to 1 and reflects the confidence + associated with each annotation. Only cells that meet + both the minimum prediction confidence score and the + minimum prediction mapping score thresholds will be + included in the analysis. Default: 0.75 + --minmapscore MINMAPSCORE + The minimum threshold for a prediction mapping score + is calculated at the cell. This metric ranges from 0 + to 1 and reflects how well the unique structure of a + cell’s local neighborhood is preserved during + reference mapping. Only cells that meet both the + minimum prediction mapping score and the minimum + prediction confidence score thresholds will be + included in the analysis. Default: 0.75 --diffgenes Identify differentially expressed genes (putative gene markers) for the predicted cell types. 
Default: false --diffpeaks Identify differentially accessible peaks for the diff --git a/workflows/sc-rna-azimuth.cwl b/workflows/sc-rna-azimuth.cwl index a2187a2d..33b79fc2 100644 --- a/workflows/sc-rna-azimuth.cwl +++ b/workflows/sc-rna-azimuth.cwl @@ -75,6 +75,37 @@ inputs: Column from the metadata of the reference Seurat object to select the reference annotations. + minimum_confidence_score: + type: float? + default: 0.75 + label: "Minimum prediction confidence score" + doc: | + The minimum threshold for a prediction + confidence score is calculated at the cell + level. This metric ranges from 0 to 1 and + reflects the confidence associated with each + annotation. Only cells that meet both the + minimum prediction confidence score and the + minimum prediction mapping score thresholds + will be included in the analysis. + Default: 0.75 + + minimum_mapping_score: + type: float? + default: 0.75 + label: "Minimum prediction mapping score" + doc: | + The minimum threshold for a prediction + mapping score is calculated at the cell. + This metric ranges from 0 to 1 and reflects + how well the unique structure of a cell's + local neighborhood is preserved during + reference mapping. Only cells that meet both + the minimum prediction mapping score and the + minimum prediction confidence score thresholds + will be included in the analysis. + Default: 0.75 + identify_diff_genes: type: boolean? default: true @@ -199,334 +230,441 @@ inputs: - "4" - "5" - "6" - default: "6" + default: "4" label: "Cores/CPUs" doc: | Parallelization parameter to define the number of cores/CPUs that can be utilized simultaneously. - Default: 6 + Default: 4 "sd:layout": advanced: true outputs: + ref_cell_cnts_gr_ctyp_plot_png: + type: File? + outputSource: rna_azimuth/ref_cell_cnts_gr_ctyp_plot_png + label: "Number of cells per cell type (all reference cells)" + doc: | + Number of cells per cell type. + All reference cells. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "Reference" + Caption: "Number of cells per cell type (all reference cells)" + + ref_umap_gr_ctyp_plot_png: + type: File? + outputSource: rna_azimuth/ref_umap_gr_ctyp_plot_png + label: "Reference UMAP colored by cell type (all reference cells)" + doc: | + Reference UMAP colored by cell type. + All reference cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "Reference" + Caption: "Reference UMAP colored by cell type (all reference cells)" + cell_cnts_gr_ctyp_plot_png: type: File? outputSource: rna_azimuth/cell_cnts_gr_ctyp_plot_png - label: "Number of cells per cell type (all cells)" + label: "Number of cells per cell type (all query cells)" doc: | Number of cells per cell type. - All cells. + All query cells. PNG format. "sd:visualPlugins": - image: tab: "QC" - Caption: "Number of cells per cell type (all cells)" + Caption: "Number of cells per cell type (all query cells)" - umap_qc_mtrcs_plot_png: + qc_mtrcs_dnst_gr_ctyp_plot_png: type: File? - outputSource: rna_azimuth/umap_qc_mtrcs_plot_png - label: "UMAP, QC metrics (all cells)" + outputSource: rna_azimuth/qc_mtrcs_dnst_gr_ctyp_plot_png + label: "Distribution of QC metrics per cell colored by cell type (all query cells)" doc: | - UMAP, QC metrics. - All cells. + Distribution of QC metrics per + cell colored by cell type. + All query cells. PNG format. "sd:visualPlugins": - image: tab: "QC" - Caption: "UMAP, QC metrics (all cells)" + Caption: "Distribution of QC metrics per cell colored by cell type (all query cells)" - qc_mtrcs_dnst_gr_ctyp_plot_png: + umap_cnf_plot_png: type: File? - outputSource: rna_azimuth/qc_mtrcs_dnst_gr_ctyp_plot_png - label: "Distribution of QC metrics per cell colored by cell type (all cells)" + outputSource: rna_azimuth/umap_cnf_plot_png + label: "Projected UMAP colored by prediction confidence score (all query cells)" doc: | - Distribution of QC metrics per cell - colored by cell type. - All cells. 
+ Projected UMAP colored by + prediction confidence score. + All query cells. PNG format. "sd:visualPlugins": - image: tab: "QC" - Caption: "Distribution of QC metrics per cell colored by cell type (all cells)" + Caption: "Projected UMAP colored by prediction confidence score (all query cells)" - gene_umi_spl_ctyp_plot_png: + umap_map_plot_png: type: File? - outputSource: rna_azimuth/gene_umi_spl_ctyp_plot_png - label: "Genes vs RNA reads per cell (split by cell type, all cells)" + outputSource: rna_azimuth/umap_map_plot_png + label: "Projected UMAP colored by prediction mapping score (all query cells)" + doc: | + Projected UMAP colored by + prediction mapping score. + All query cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Projected UMAP colored by prediction mapping score (all query cells)" + + gene_umi_gr_cnf_spl_ctyp_plot_png: + type: File? + outputSource: rna_azimuth/gene_umi_gr_cnf_spl_ctyp_plot_png + label: "Genes vs RNA reads per cell colored by prediction confidence score (split by cell type, all query cells)" doc: | Genes vs RNA reads per cell. - Split by cell type; all cells. + All query cells; split by cell type; + colored by prediction confidence score. PNG format. "sd:visualPlugins": - image: tab: "QC" - Caption: "Genes vs RNA reads per cell (split by cell type, all cells)" + Caption: "Genes vs RNA reads per cell colored by prediction confidence score (split by cell type, all query cells)" - umi_mito_spl_ctyp_plot_png: + gene_umi_gr_map_spl_ctyp_plot_png: type: File? - outputSource: rna_azimuth/umi_mito_spl_ctyp_plot_png - label: "RNA reads vs mitochondrial % per cell (split by cell type, all cells)" + outputSource: rna_azimuth/gene_umi_gr_map_spl_ctyp_plot_png + label: "Genes vs RNA reads per cell colored by prediction mapping score (split by cell type, all query cells)" + doc: | + Genes vs RNA reads per cell. + All query cells; split by cell type; + colored by prediction mapping score. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "QC" + Caption: "Genes vs RNA reads per cell colored by prediction mapping score (split by cell type, all query cells)" + + umi_mito_gr_cnf_spl_ctyp_plot_png: + type: File? + outputSource: rna_azimuth/umi_mito_gr_cnf_spl_ctyp_plot_png + label: "RNA reads vs mitochondrial % per cell colored by prediction confidence score (split by cell type, all query cells)" doc: | RNA reads vs mitochondrial % per cell. - Split by cell type; all cells. + All query cells; split by cell type; + colored by prediction confidence score. PNG format. "sd:visualPlugins": - image: tab: "QC" - Caption: "RNA reads vs mitochondrial % per cell (split by cell type, all cells)" + Caption: "RNA reads vs mitochondrial % per cell colored by prediction confidence score (split by cell type, all query cells)" - tss_frgm_spl_ctyp_plot_png: + umi_mito_gr_map_spl_ctyp_plot_png: type: File? - outputSource: rna_azimuth/tss_frgm_spl_ctyp_plot_png - label: "TSS enrichment score vs ATAC fragments in peaks per cell (split by cell type, all cells)" + outputSource: rna_azimuth/umi_mito_gr_map_spl_ctyp_plot_png + label: "RNA reads vs mitochondrial % per cell colored by prediction mapping score (split by cell type, all query cells)" doc: | - TSS enrichment score vs ATAC - fragments in peaks per cell. - Split by cell type; all cells. + RNA reads vs mitochondrial % per cell. + All query cells; split by cell type; + colored by prediction mapping score. PNG format. "sd:visualPlugins": - image: tab: "QC" - Caption: "TSS enrichment score vs ATAC fragments in peaks per cell (split by cell type, all cells)" + Caption: "RNA reads vs mitochondrial % per cell colored by prediction mapping score (split by cell type, all query cells)" - rna_atac_cnts_spl_ctyp_plot_png: + tss_frgm_gr_cnf_spl_ctyp_plot_png: type: File? 
- outputSource: rna_azimuth/rna_atac_cnts_spl_ctyp_plot_png - label: "RNA reads vs ATAC fragments in peaks per cell (split by cell type, all cells)" + outputSource: rna_azimuth/tss_frgm_gr_cnf_spl_ctyp_plot_png + label: "TSS enrichment score vs ATAC fragments in peaks per cell colored by prediction confidence score (split by cell type, all query cells)" + doc: | + TSS enrichment score vs ATAC fragments + in peaks per cell. + All query cells; split by cell type; + colored by prediction confidence score. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "TSS enrichment score vs ATAC fragments in peaks per cell colored by prediction confidence score (split by cell type, all query cells)" + + tss_frgm_gr_map_spl_ctyp_plot_png: + type: File? + outputSource: rna_azimuth/tss_frgm_gr_map_spl_ctyp_plot_png + label: "TSS enrichment score vs ATAC fragments in peaks per cell colored by prediction mapping score (split by cell type, all query cells)" + doc: | + TSS enrichment score vs ATAC fragments + in peaks per cell. + All query cells; split by cell type; + colored by prediction mapping score. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "TSS enrichment score vs ATAC fragments in peaks per cell colored by prediction mapping score (split by cell type, all query cells)" + + rna_atac_cnts_gr_cnf_spl_ctyp_plot_png: + type: File? + outputSource: rna_azimuth/rna_atac_cnts_gr_cnf_spl_ctyp_plot_png + label: "RNA reads vs ATAC fragments in peaks per cell colored by prediction confidence score (split by cell type, all query cells)" doc: | RNA reads vs ATAC fragments in peaks per cell. - Split by cell type; all cells. + All query cells; split by cell type; colored + by prediction confidence score. PNG format. 
"sd:visualPlugins": - image: tab: "QC" - Caption: "RNA reads vs ATAC fragments in peaks per cell (split by cell type, all cells)" + Caption: "RNA reads vs ATAC fragments in peaks per cell colored by prediction confidence score (split by cell type, all query cells)" + + rna_atac_cnts_gr_map_spl_ctyp_plot_png: + type: File? + outputSource: rna_azimuth/rna_atac_cnts_gr_map_spl_ctyp_plot_png + label: "RNA reads vs ATAC fragments in peaks per cell colored by prediction mapping score (split by cell type, all query cells)" + doc: | + RNA reads vs ATAC fragments in peaks per cell. + All query cells; split by cell type; colored + by prediction mapping score. + PNG format. + "sd:visualPlugins": + - image: + tab: "QC" + Caption: "RNA reads vs ATAC fragments in peaks per cell colored by prediction mapping score (split by cell type, all query cells)" rnadbl_gr_ctyp_plot_png: type: File? outputSource: rna_azimuth/rnadbl_gr_ctyp_plot_png - label: "Percentage of RNA doublets per cell type (all cells)" + label: "Percentage of RNA doublets per cell type (all query cells)" doc: | - Percentage of RNA doublets per cell type. - All cells. + Percentage of RNA doublets + per cell type. + All query cells. PNG format. "sd:visualPlugins": - image: tab: "QC" - Caption: "Percentage of RNA doublets per cell type (all cells)" + Caption: "Percentage of RNA doublets per cell type (all query cells)" atacdbl_gr_ctyp_plot_png: type: File? outputSource: rna_azimuth/atacdbl_gr_ctyp_plot_png - label: "Percentage of ATAC doublets per cell type (all cells)" + label: "Percentage of ATAC doublets per cell type (all query cells)" doc: | - Percentage of ATAC doublets per cell type. - All cells. + Percentage of ATAC doublets + per cell type. + All query cells. PNG format. "sd:visualPlugins": - image: tab: "QC" - Caption: "Percentage of ATAC doublets per cell type (all cells)" + Caption: "Percentage of ATAC doublets per cell type (all query cells)" vrlpdbl_gr_ctyp_plot_png: type: File? 
outputSource: rna_azimuth/vrlpdbl_gr_ctyp_plot_png - label: "Percentage of RNA and ATAC doublets per cell type (all cells)" + label: "Percentage of RNA and ATAC doublets per cell type (all query cells)" doc: | - Percentage of RNA and ATAC doublets - per cell type. - All cells. + Percentage of RNA and ATAC + doublets per cell type. + All query cells. PNG format. "sd:visualPlugins": - image: tab: "QC" - Caption: "Percentage of RNA and ATAC doublets per cell type (all cells)" + Caption: "Percentage of RNA and ATAC doublets per cell type (all query cells)" umap_gr_ctyp_plot_png: type: File? outputSource: rna_azimuth/umap_gr_ctyp_plot_png - label: "UMAP colored by cell type (all cells)" + label: "Projected UMAP colored by cell type (filtered query cells)" doc: | - UMAP colored by cell type. - All cells. + Projected UMAP colored by cell type. + Filtered query cells. PNG format. "sd:visualPlugins": - image: tab: "Split by cell type" - Caption: "UMAP colored by cell type (all cells)" + Caption: "Projected UMAP colored by cell type (filtered query cells)" umap_gr_ctyp_spl_ph_png: type: File? outputSource: rna_azimuth/umap_gr_ctyp_spl_ph_png - label: "UMAP colored by cell type (split by cell cycle phase, optionally downsampled)" + label: "Projected UMAP colored by cell type (split by cell cycle phase, optionally downsampled filtered query cells)" doc: | - UMAP colored by cell type. - Split by cell cycle phase; downsampled - to the smallest dataset (if multiple - datasets are analyzed jointly). + Projected UMAP colored by cell type. + Filtered query cells; split by cell + cycle phase; downsampled to the + smallest dataset. PNG format. "sd:visualPlugins": - image: tab: "Split by cell type" - Caption: "UMAP colored by cell type (split by cell cycle phase, optionally downsampled)" + Caption: "Projected UMAP colored by cell type (split by cell cycle phase, optionally downsampled filtered query cells)" cmp_gr_ph_spl_ctyp_png: type: File? 
outputSource: rna_azimuth/cmp_gr_ph_spl_ctyp_png - label: "Composition plot colored by cell cycle phase (split by cell type, optionally downsampled)" + label: "Composition plot colored by cell cycle phase (split by cell type, optionally downsampled filtered query cells)" doc: | Composition plot colored by cell cycle phase. - Split by cell type; downsampled to the - smallest dataset (if multiple datasets are - analyzed jointly). + Filtered query cells; split by cell type; + downsampled to the smallest dataset. PNG format. "sd:visualPlugins": - image: tab: "Split by cell type" - Caption: "Composition plot colored by cell cycle phase (split by cell type, optionally downsampled)" + Caption: "Composition plot colored by cell cycle phase (split by cell type, optionally downsampled filtered query cells)" umap_gr_ctyp_spl_idnt_plot_png: type: File? outputSource: rna_azimuth/umap_gr_ctyp_spl_idnt_plot_png - label: "UMAP colored by cell type (split by dataset, downsampled)" + label: "Projected UMAP colored by cell type (split by dataset, downsampled filtered query cells)" doc: | - UMAP colored by cell type. - Split by dataset; downsampled to the - smallest dataset. + Projected UMAP colored by cell type. + Filtered query cells; split by dataset; + downsampled to the smallest dataset. PNG format. "sd:visualPlugins": - image: tab: "Split by dataset" - Caption: "UMAP colored by cell type (split by dataset, downsampled)" + Caption: "Projected UMAP colored by cell type (split by dataset, downsampled filtered query cells)" cmp_gr_ctyp_spl_idnt_plot_png: type: File? outputSource: rna_azimuth/cmp_gr_ctyp_spl_idnt_plot_png - label: "Composition plot colored by cell type (split by dataset, downsampled)" + label: "Composition plot colored by cell type (split by dataset, downsampled filtered query cells)" doc: | Composition plot colored by cell type. - Split by dataset; downsampled to the - smallest dataset. + Filtered query cells; split by dataset; + downsampled to the smallest dataset. 
PNG format. "sd:visualPlugins": - image: tab: "Split by dataset" - Caption: "Composition plot colored by cell type (split by dataset, downsampled)" + Caption: "Composition plot colored by cell type (split by dataset, downsampled filtered query cells)" umap_gr_ph_spl_idnt_plot_png: type: File? outputSource: rna_azimuth/umap_gr_ph_spl_idnt_plot_png - label: "UMAP colored by cell cycle phase (split by dataset, downsampled)" + label: "Projected UMAP colored by cell cycle phase (split by dataset, downsampled filtered query cells)" doc: | - UMAP colored by cell cycle phase. - Split by dataset; downsampled to the - smallest dataset. + Projected UMAP colored by cell cycle phase. + Filtered query cells; split by dataset; + downsampled to the smallest dataset. PNG format. "sd:visualPlugins": - image: tab: "Split by dataset" - Caption: "UMAP colored by cell cycle phase (split by dataset, downsampled)" + Caption: "Projected UMAP colored by cell cycle phase (split by dataset, downsampled filtered query cells)" cmp_gr_ph_spl_idnt_plot_png: type: File? outputSource: rna_azimuth/cmp_gr_ph_spl_idnt_plot_png - label: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" + label: "Composition plot colored by cell cycle phase (split by dataset, downsampled filtered query cells)" doc: | Composition plot colored by cell cycle phase. - Split by dataset; downsampled to the smallest - dataset. + Filtered query cells; split by dataset; + downsampled to the smallest dataset. PNG format. "sd:visualPlugins": - image: tab: "Split by dataset" - Caption: "Composition plot colored by cell cycle phase (split by dataset, downsampled)" + Caption: "Composition plot colored by cell cycle phase (split by dataset, downsampled filtered query cells)" umap_gr_ctyp_spl_cnd_plot_png: type: File? 
outputSource: rna_azimuth/umap_gr_ctyp_spl_cnd_plot_png - label: "UMAP colored by cell type (split by grouping condition, downsampled)" + label: "Projected UMAP colored by cell type (split by grouping condition, downsampled filtered query cells)" doc: | - UMAP colored by cell type. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to + Projected UMAP colored by cell type. + Filtered query cells; split by grouping + condition; first downsampled to the + smallest dataset, then downsampled to the smallest group. PNG format. "sd:visualPlugins": - image: tab: "Split by group" - Caption: "UMAP colored by cell type (split by grouping condition, downsampled)" + Caption: "Projected UMAP colored by cell type (split by grouping condition, downsampled filtered query cells)" cmp_gr_ctyp_spl_cnd_plot_png: type: File? outputSource: rna_azimuth/cmp_gr_ctyp_spl_cnd_plot_png - label: "Composition plot colored by cell type (split by grouping condition, downsampled)" + label: "Composition plot colored by cell type (split by grouping condition, downsampled filtered query cells)" doc: | Composition plot colored by cell type. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to + Filtered query cells; split by grouping + condition; first downsampled to the + smallest dataset, then downsampled to the smallest group. PNG format. "sd:visualPlugins": - image: tab: "Split by group" - Caption: "Composition plot colored by cell type (split by grouping condition, downsampled)" + Caption: "Composition plot colored by cell type (split by grouping condition, downsampled filtered query cells)" umap_gr_ph_spl_cnd_plot_png: type: File? 
outputSource: rna_azimuth/umap_gr_ph_spl_cnd_plot_png - label: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" + label: "Projected UMAP colored by cell cycle phase (split by grouping condition, downsampled filtered query cells)" doc: | - UMAP colored by cell cycle phase. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group. + Projected UMAP colored by cell cycle phase. + Filtered query cells; split by grouping + condition; first downsampled to the smallest + dataset, then downsampled to the smallest group. PNG format. "sd:visualPlugins": - image: tab: "Split by group" - Caption: "UMAP colored by cell cycle phase (split by grouping condition, downsampled)" + Caption: "Projected UMAP colored by cell cycle phase (split by grouping condition, downsampled filtered query cells)" cmp_gr_ph_spl_cnd_plot_png: type: File? outputSource: rna_azimuth/cmp_gr_ph_spl_cnd_plot_png - label: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" + label: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled filtered query cells)" doc: | Composition plot colored by cell cycle phase. - Split by grouping condition; first downsampled - to the smallest dataset, then downsampled to - the smallest group. + Filtered query cells; split by grouping condition; + first downsampled to the smallest dataset, then + downsampled to the smallest group. PNG format. "sd:visualPlugins": - image: tab: "Split by group" - Caption: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled)" + Caption: "Composition plot colored by cell cycle phase (split by grouping condition, downsampled filtered query cells)" xpr_avg_plot_png: type: File? outputSource: rna_azimuth/xpr_avg_plot_png - label: "Average gene expression" + label: "Average gene expression (filtered query cells)" doc: | Average gene expression. 
+ Filtered query cells. PNG format. "sd:visualPlugins": - image: tab: "Genes of interest (expression)" - Caption: "Average gene expression" + Caption: "Average gene expression (filtered query cells)" xpr_dnst_plot_png: type: File? outputSource: rna_azimuth/xpr_dnst_plot_png - label: "Gene expression density" + label: "Gene expression density (filtered query cells)" doc: | Gene expression density. + Filtered query cells. PNG format. "sd:visualPlugins": - image: tab: "Genes of interest (expression)" - Caption: "Gene expression density" + Caption: "Gene expression density (filtered query cells)" xpr_per_cell_plot_png: type: @@ -534,15 +672,15 @@ outputs: - type: array items: File outputSource: rna_azimuth/xpr_per_cell_plot_png - label: "UMAP colored by gene expression (per gene)" + label: "Projected UMAP colored by gene expression (per gene, filtered query cells)" doc: | - UMAP colored by gene expression. - All genes of interest. + Projected UMAP colored by gene expression. + Filtered query cells; all genes of interest. PNG format. "sd:visualPlugins": - image: tab: "Genes of interest (expression)" - Caption: "UMAP colored by gene expression (per gene)" + Caption: "Projected UMAP colored by gene expression (per gene, filtered query cells)" cvrg_plot_png: type: @@ -550,61 +688,66 @@ outputs: - type: array items: File outputSource: rna_azimuth/cvrg_plot_png - label: "ATAC fragment coverage (per gene)" + label: "ATAC fragment coverage (per gene, filtered query cells)" doc: | ATAC fragment coverage. - All genes of interest. + Filtered query cells; + all genes of interest. PNG format. "sd:visualPlugins": - image: tab: "Genes of interest (coverage)" - Caption: "ATAC fragment coverage (per gene)" + Caption: "ATAC fragment coverage (per gene, filtered query cells)" xpr_htmp_plot_png: type: File? 
outputSource: rna_azimuth/xpr_htmp_plot_png - label: "Gene expression heatmap (top gene markers)" + label: "Gene expression heatmap (top gene markers, filtered query cells)" doc: | - Gene expression heatmap. + Gene expression heatmap from + the filtered query cells. Top gene markers. PNG format. "sd:visualPlugins": - image: tab: "Gene markers heatmap" - Caption: "Gene expression heatmap (top gene markers)" + Caption: "Gene expression heatmap (top gene markers, filtered query cells)" xpr_htmp_tsv: type: File? outputSource: rna_azimuth/xpr_htmp_tsv - label: "Gene expression heatmap (top gene markers)" + label: "Gene expression heatmap (top gene markers, filtered query cells)" doc: | - Gene expression heatmap. + Gene expression heatmap from + the filtered query cells. Top gene markers. TSV format. gene_markers_tsv: type: File? outputSource: rna_azimuth/gene_markers_tsv - label: "Gene markers" + label: "Gene markers (filtered query cells)" doc: | - Gene markers. + Gene markers from the filtered + query cells. TSV format. "sd:visualPlugins": - syncfusiongrid: tab: "Gene markers table" - Title: "Gene markers" + Title: "Gene markers (filtered query cells)" peak_markers_tsv: type: File? outputSource: rna_azimuth/peak_markers_tsv - label: "Peak markers" + label: "Peak markers (filtered query cells)" doc: | - Peak markers. + Peak markers from the filtered + query cells. TSV format. "sd:visualPlugins": - syncfusiongrid: tab: "Peak markers table" - Title: "Peak markers" + Title: "Peak markers (filtered query cells)" ucsc_cb_html_data: type: Directory? 
@@ -693,6 +836,8 @@ steps: reference_data_rds: reference_data_rds reference_data_index: reference_data_index reference_source_column: reference_source_column + minimum_confidence_score: minimum_confidence_score + minimum_mapping_score: minimum_mapping_score atac_fragments_file: atac_fragments_file genes_of_interest: source: genes_of_interest @@ -732,36 +877,43 @@ steps: source: threads valueFrom: $(parseInt(self)) out: + - ref_cell_cnts_gr_ctyp_plot_png + - ref_umap_gr_ctyp_plot_png - cell_cnts_gr_ctyp_plot_png - - umap_qc_mtrcs_plot_png - - gene_umi_spl_ctyp_plot_png - - umi_mito_spl_ctyp_plot_png + - qc_mtrcs_dnst_gr_ctyp_plot_png + - umap_cnf_plot_png + - umap_map_plot_png + - gene_umi_gr_cnf_spl_ctyp_plot_png + - gene_umi_gr_map_spl_ctyp_plot_png + - umi_mito_gr_cnf_spl_ctyp_plot_png + - umi_mito_gr_map_spl_ctyp_plot_png + - tss_frgm_gr_cnf_spl_ctyp_plot_png + - tss_frgm_gr_map_spl_ctyp_plot_png + - rna_atac_cnts_gr_cnf_spl_ctyp_plot_png + - rna_atac_cnts_gr_map_spl_ctyp_plot_png - rnadbl_gr_ctyp_plot_png - - tss_frgm_spl_ctyp_plot_png - atacdbl_gr_ctyp_plot_png - - rna_atac_cnts_spl_ctyp_plot_png - vrlpdbl_gr_ctyp_plot_png - - qc_mtrcs_dnst_gr_ctyp_plot_png - umap_gr_ctyp_plot_png + - umap_gr_ctyp_spl_ph_png + - cmp_gr_ph_spl_ctyp_png - umap_gr_ctyp_spl_idnt_plot_png - cmp_gr_ctyp_spl_idnt_plot_png - umap_gr_ph_spl_idnt_plot_png - cmp_gr_ph_spl_idnt_plot_png - - umap_gr_ctyp_spl_ph_png - - cmp_gr_ph_spl_ctyp_png - umap_gr_ctyp_spl_cnd_plot_png - cmp_gr_ctyp_spl_cnd_plot_png - umap_gr_ph_spl_cnd_plot_png - cmp_gr_ph_spl_cnd_plot_png - xpr_avg_plot_png - - xpr_per_cell_plot_png - xpr_dnst_plot_png - - xpr_htmp_plot_png + - xpr_per_cell_plot_png - cvrg_plot_png - - all_plots_pdf + - xpr_htmp_plot_png - xpr_htmp_tsv - gene_markers_tsv - peak_markers_tsv + - all_plots_pdf - ucsc_cb_html_data - ucsc_cb_html_file - seurat_data_rds From 71c52a7242ea23ff4165728b19c3b9caddd72a30 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 23 Sep 2024 11:37:24 -0400 
Subject: [PATCH 156/162] Add a pipeline to load data from BD Rhapsody --- tools/sc-rna-load-rhapsody.cwl | 270 +++++++++++++++++++++++++++++ workflows/sc-rna-azimuth.cwl | 1 + workflows/sc-rna-filter.cwl | 1 + workflows/sc-rna-load-rhapsody.cwl | 225 ++++++++++++++++++++++++ 4 files changed, 497 insertions(+) create mode 100644 tools/sc-rna-load-rhapsody.cwl create mode 100644 workflows/sc-rna-load-rhapsody.cwl diff --git a/tools/sc-rna-load-rhapsody.cwl b/tools/sc-rna-load-rhapsody.cwl new file mode 100644 index 00000000..0f0ac947 --- /dev/null +++ b/tools/sc-rna-load-rhapsody.cwl @@ -0,0 +1,270 @@ +cwlVersion: v1.0 +class: CommandLineTool + + +requirements: +- class: InlineJavascriptRequirement +- class: EnvVarRequirement + envDef: + R_MAX_VSIZE: $((inputs.vector_memory_limit * 1000000000).toString()) + + +hints: +- class: DockerRequirement + dockerPull: biowardrobe2/sc-tools:v0.0.41 + + +inputs: + + query_data_rds: + type: + - File + - type: array + items: File + inputBinding: + prefix: "--rds" + doc: | + Path to the RDS file(s) to load Seurat object(s) + from. These files should be generated by the BD + Rhapsody Sequence Analysis Pipeline and include + Sample_Tag metadata column. + + sample_tags_metadata: + type: File + inputBinding: + prefix: "--metadata" + doc: | + Path to the TSV/CSV file to assign names to the + sample tags. This file must include exactly two + columns. First column is a 'sample_tag'. It should + correspond to all unique values from the 'Sample_Tag' + column of the loaded Seurat object(s). Second column + may have an arbitrary name. But it should include + unique names for each sample tag. + + split_by_origin: + type: boolean? + inputBinding: + prefix: "--split" + doc: | + When assigning names to the sample tags, split + each of them by origin (the RDS file the data + was loaded from). + Default: do not split + + verbose: + type: boolean? + inputBinding: + prefix: "--verbose" + doc: | + Print debug information. 
+ Default: false + + export_h5ad_data: + type: boolean? + inputBinding: + prefix: "--h5ad" + doc: | + Save raw counts from the RNA assay to h5ad file. + Default: false + + export_html_report: + type: boolean? + default: false + doc: | + Export tehcnical report. HTML format. + Note, stdout will be less informative. + Default: false + + output_prefix: + type: string? + inputBinding: + prefix: "--output" + doc: | + Output prefix. + Default: ./sc + + parallel_memory_limit: + type: int? + inputBinding: + prefix: "--memory" + doc: | + Maximum memory in GB allowed to be shared between the workers + when using multiple --cpus. + Default: 32 + + vector_memory_limit: + type: int? + default: 128 + doc: | + Maximum vector memory in GB allowed to be used by R. + Default: 128 + + threads: + type: int? + inputBinding: + prefix: "--cpus" + doc: | + Number of cores/cpus to use. + Default: 1 + + seed: + type: int? + inputBinding: + prefix: "--seed" + doc: | + Seed number for random values. + Default: 42 + + +outputs: + + feature_bc_matrices_folder: + type: File + outputBinding: + glob: "*_bc_matrix.tar.gz" + doc: | + Compressed folder with the merged feature-barcode + matrix from the loaded RDS files produced by the + BD Rhapsody Sequence Analysis Pipeline. + MEX format (TAR-gzipped). + + aggregation_metadata: + type: File + outputBinding: + glob: "*_aggr.tsv" + doc: | + Aggregation metadata file with names assigned + to sample tags. The row order corresponds to + the numeric suffixes added to cell barcodes in + the merged feature-barcode matrix. + TSV format. + + seurat_data_rds: + type: File + outputBinding: + glob: "*_data.rds" + doc: | + Seurat object. + RDS format. + + seurat_data_h5ad: + type: File? + outputBinding: + glob: "*_counts.h5ad" + doc: | + Seurat object. + H5AD format. + + sc_report_html_file: + type: File? + outputBinding: + glob: "sc_report.html" + doc: | + Tehcnical report. + HTML format. 
+ + stdout_log: + type: stdout + + stderr_log: + type: stderr + + +baseCommand: ["Rscript"] +arguments: +- valueFrom: $(inputs.export_html_report?["/usr/local/bin/sc_report_wrapper.R", "/usr/local/bin/sc_rna_load_rhapsody.R"]:"/usr/local/bin/sc_rna_load_rhapsody.R") + +stdout: sc_rna_load_rhapsody_stdout.log +stderr: sc_rna_load_rhapsody_stderr.log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + + +label: "Single-cell RNA-Seq BD Rhapsody Import" +s:name: "Single-cell RNA-Seq BD Rhapsody Import" +s:alternateName: "Single-cell RNA-Seq BD Rhapsody Import" + +s:downloadUrl: https://raw.githubusercontent.com/Barski-lab/workflows/master/tools/sc-rna-load-rhapsody.cwl +s:codeRepository: https://github.com/Barski-lab/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell RNA-Seq BD Rhapsody Import + + Imports RDS files produced by the BD Rhapsody + Sequence Analysis Pipeline. Assigns names to the + sample tags based on the provided metadata file. 
+ Exports results into compressed feature barcode + matrix, RDS and h5ad files. + + +s:about: | + usage: /usr/local/bin/sc_rna_load_rhapsody.R [-h] --rds RDS [RDS ...] + --metadata METADATA [--split] + [--verbose] [--h5ad] + [--output OUTPUT] [--cpus CPUS] + [--memory MEMORY] [--seed SEED] + + Single-cell RNA-Seq BD Rhapsody Import + + optional arguments: + -h, --help show this help message and exit + --rds RDS [RDS ...] Path to the RDS file(s) to load Seurat object(s) from. + These files should be generated by the BD Rhapsody + Sequence Analysis Pipeline and include Sample_Tag + metadata column. + --metadata METADATA Path to the TSV/CSV file to assign names to the sample + tags. This file must include exactly two columns. First + column is a 'sample_tag'. It should correspond to all + unique values from the 'Sample_Tag' column of the + loaded Seurat object(s). Second column may have an + arbitrary name. But it should include unique names for + each sample tag. + --split When assigning names to the sample tags, split each of + them by sample (the RDS file the data was loaded from). + Default: do not split + --verbose Print debug information. Default: false + --h5ad Save raw counts from the RNA assay to h5ad file. + Default: false + --output OUTPUT Output prefix. Default: ./sc + --cpus CPUS Number of cores/cpus to use. Default: 1 + --memory MEMORY Maximum memory in GB allowed to be shared between the + workers when using multiple --cpus. Default: 32 + --seed SEED Seed number for random values. 
Default: 42 \ No newline at end of file diff --git a/workflows/sc-rna-azimuth.cwl b/workflows/sc-rna-azimuth.cwl index 33b79fc2..62d6af3f 100644 --- a/workflows/sc-rna-azimuth.cwl +++ b/workflows/sc-rna-azimuth.cwl @@ -27,6 +27,7 @@ requirements: - "sc-atac-reduce.cwl" - "sc-atac-cluster.cwl" - "sc-ctype-assign.cwl" + - "sc-rna-load-rhapsody.cwl" sc_atac_sample: - "cellranger-arc-count.cwl" - "cellranger-arc-aggr.cwl" diff --git a/workflows/sc-rna-filter.cwl b/workflows/sc-rna-filter.cwl index 2d9f356d..601073d2 100644 --- a/workflows/sc-rna-filter.cwl +++ b/workflows/sc-rna-filter.cwl @@ -20,6 +20,7 @@ requirements: - "single-cell-preprocess-cellranger.cwl" - "cellranger-multi.cwl" - "sc-format-transform.cwl" + - "sc-rna-load-rhapsody.cwl" inputs: diff --git a/workflows/sc-rna-load-rhapsody.cwl b/workflows/sc-rna-load-rhapsody.cwl new file mode 100644 index 00000000..639e078f --- /dev/null +++ b/workflows/sc-rna-load-rhapsody.cwl @@ -0,0 +1,225 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: +- class: SubworkflowFeatureRequirement +- class: StepInputExpressionRequirement +- class: InlineJavascriptRequirement +- class: MultipleInputFeatureRequirement + + +inputs: + + alias: + type: string + label: "Analysis name" + sd:preview: + position: 1 + + query_data_rds: + type: + - File + - type: array + items: File + label: "RDS file(s) produced by BD Rhapsody Sequence Analysis Pipeline" + doc: | + Path to the RDS file(s) to load Seurat object(s) + from. These files should be generated by the BD + Rhapsody Sequence Analysis Pipeline and include + Sample_Tag metadata column. + + sample_tags_metadata: + type: File + label: "Sample tags metadata file (first column should be sample_tag)" + doc: | + Path to the TSV/CSV file to assign names to the + sample tags. This file must include exactly two + columns. First column is a 'sample_tag'. It should + correspond to all unique values from the 'Sample_Tag' + column of the loaded Seurat object(s). 
Second column + may have an arbitrary name. But it should include + unique names for each sample tag. + + split_by_origin: + type: boolean? + default: false + label: "Split each sample tag by origin" + doc: | + When assigning names to the sample tags, split + each of them by origin (the RDS file the data + was loaded from). + Default: do not split + "sd:layout": + advanced: true + + export_html_report: + type: boolean? + default: true + label: "Show HTML report" + doc: | + Export tehcnical report in HTML format. + Default: true + "sd:layout": + advanced: true + + threads: + type: + - "null" + - type: enum + symbols: + - "1" + - "2" + - "3" + - "4" + - "5" + - "6" + default: "4" + label: "Cores/CPUs" + doc: | + Parallelization parameter to define the + number of cores/CPUs that can be utilized + simultaneously. + Default: 4 + "sd:layout": + advanced: true + + +outputs: + + filtered_feature_bc_matrix_folder: + type: File + outputSource: sc_rna_load_rhapsody/feature_bc_matrices_folder + label: "TAR-gzipped folder with the feature-barcode matrix in MEX format" + doc: | + Compressed folder with the merged feature-barcode + matrix from the loaded RDS files produced by the + BD Rhapsody Sequence Analysis Pipeline. + MEX format (TAR-gzipped). + + aggregation_metadata: + type: File? + outputSource: sc_rna_load_rhapsody/aggregation_metadata + label: "Aggregation metadata in TSV format" + doc: | + Aggregation metadata file with names assigned + to sample tags. The row order corresponds to + the numeric suffixes added to cell barcodes in + the merged feature-barcode matrix. + TSV format. + + seurat_data_rds: + type: File + outputSource: sc_rna_load_rhapsody/seurat_data_rds + label: "Seurat object in RDS format" + doc: | + Seurat object. + RDS format. + + sc_report_html_file: + type: File? + outputSource: sc_rna_load_rhapsody/sc_report_html_file + label: "Analysis log" + doc: | + Tehcnical report. + HTML format. 
+ "sd:visualPlugins": + - linkList: + tab: "Overview" + target: "_blank" + + sc_rna_load_rhapsody_stdout_log: + type: File + outputSource: sc_rna_load_rhapsody/stdout_log + label: "Output log" + doc: | + Stdout log from the sc_rna_load_rhapsody step. + + sc_rna_load_rhapsody_stderr_log: + type: File + outputSource: sc_rna_load_rhapsody/stderr_log + label: "Error log" + doc: | + Stderr log from the sc_rna_load_rhapsody step. + + +steps: + + sc_rna_load_rhapsody: + run: ../tools/sc-rna-load-rhapsody.cwl + in: + query_data_rds: query_data_rds + sample_tags_metadata: sample_tags_metadata + split_by_origin: split_by_origin + export_html_report: export_html_report + verbose: + default: true + parallel_memory_limit: + default: 32 + vector_memory_limit: + default: 128 + threads: + source: threads + valueFrom: $(parseInt(self)) + out: + - feature_bc_matrices_folder + - aggregation_metadata + - seurat_data_rds + - sc_report_html_file + - stdout_log + - stderr_log + + +$namespaces: + s: http://schema.org/ + +$schemas: +- https://github.com/schemaorg/schemaorg/raw/main/data/releases/11.01/schemaorg-current-http.rdf + +label: "Single-cell RNA-Seq BD Rhapsody Import" +s:name: "Single-cell RNA-Seq BD Rhapsody Import" +s:alternateName: "Single-cell RNA-Seq BD Rhapsody Import" + +s:downloadUrl: https://raw.githubusercontent.com/datirium/workflows/master/workflows/sc-rna-load-rhapsody.cwl +s:codeRepository: https://github.com/datirium/workflows +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: 
"https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + + +doc: | + Single-cell RNA-Seq BD Rhapsody Import + + Imports RDS files produced by the BD Rhapsody + Sequence Analysis Pipeline. Assigns names to the + sample tags based on the provided metadata file. + Exports results into compressed feature barcode + matrix and RDS files. From 8aa19997fb904ab68859430105f62b8a1eb04749 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Tue, 1 Oct 2024 13:20:06 -0400 Subject: [PATCH 157/162] Export Azimuth reference model from the sc ctype assign pipeline --- tools/sc-ctype-assign.cwl | 39 +++++++- workflows/sc-ctype-assign.cwl | 23 +++++ workflows/sc-rna-azimuth.cwl | 165 +++++++++++++++++++++++++--------- 3 files changed, 182 insertions(+), 45 deletions(-) diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index 6c58b24f..18f3e5db 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -301,6 +301,17 @@ inputs: have RNA assay this parameter will be ignored. Default: false + export_azimuth_ref: + type: boolean? + inputBinding: + prefix: "--azimuth" + doc: | + Save Seurat object with the assigned cell + types as model for the reference mapping + in Azimuth. Both RDS and annoy index files + will be created. + Default: false + export_ucsc_cb: type: boolean? inputBinding: @@ -700,7 +711,7 @@ outputs: seurat_data_rds: type: File outputBinding: - glob: "*_data.rds" + glob: "*[!_ref]_data.rds" doc: | Seurat object. RDS format. @@ -749,6 +760,25 @@ outputs: SCope compatible. Loom format. + reference_data_rds: + type: File? 
+ outputBinding: + glob: "*_ref_data.rds" + doc: | + Seurat object with assigned cell + types formatted as an Azimuth + reference model. + RDS format. + + reference_data_index: + type: File? + outputBinding: + glob: "*_ref_data.annoy" + doc: | + Annoy index generated for the + Azimuth reference model. + Annoy format. + sc_report_html_file: type: File? outputBinding: @@ -842,8 +872,8 @@ s:about: | [--upstream UPSTREAM] [--downstream DOWNSTREAM] [--pdf] [--verbose] [--h5seurat] [--h5ad] - [--cbbuild] [--scope] - [--output OUTPUT] + [--loupe] [--azimuth] [--cbbuild] + [--scope] [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] [--seed SEED] @@ -948,6 +978,9 @@ s:about: | enabling this feature you accept the End-User License Agreement available at https://10xgen.com/EULA. Default: false + --azimuth Save Seurat object with the assigned cell types as + model for the reference mapping in Azimuth. Both RDS + and annoy index files will be created. Default: false --cbbuild Export results to UCSC Cell Browser. Default: false --scope Save Seurat data to SCope compatible loom file. Only not normalized raw counts from the RNA assay will be diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index 806c02f2..e81fd12b 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -661,6 +661,25 @@ outputs: SCope compatible. Loom format. + reference_data_rds: + type: File? + outputSource: ctype_assign/reference_data_rds + label: "Seurat object formatted as an Azimuth reference model" + doc: | + Seurat object with assigned cell + types formatted as an Azimuth + reference model. + RDS format. + + reference_data_index: + type: File? + outputSource: ctype_assign/reference_data_index + label: "Annoy index for the Azimuth reference model" + doc: | + Annoy index generated for the + Azimuth reference model. + Annoy format. + seurat_rna_data_cloupe: type: File? 
outputSource: ctype_assign/seurat_rna_data_cloupe @@ -763,6 +782,8 @@ steps: default: LR verbose: default: true + export_azimuth_ref: + default: true export_ucsc_cb: default: true export_scope_data: @@ -813,6 +834,8 @@ steps: - ucsc_cb_html_file - seurat_data_rds - seurat_data_scope + - reference_data_rds + - reference_data_index - seurat_rna_data_cloupe - sc_report_html_file - stdout_log diff --git a/workflows/sc-rna-azimuth.cwl b/workflows/sc-rna-azimuth.cwl index 62d6af3f..5aaac260 100644 --- a/workflows/sc-rna-azimuth.cwl +++ b/workflows/sc-rna-azimuth.cwl @@ -15,6 +15,15 @@ requirements: var splitted_line = line?line.split(/[\s,]+/).filter(get_unique):null; return (splitted_line && !!splitted_line.length)?splitted_line:null; }; + - var get_source_column = function(prefix, reduction, resolution) { + if (reduction == "RNA" && resolution != null) { + return prefix + "rna_res." + resolution; + } else if (reduction == "WNN" && resolution != null) { + return prefix + "wsnn_res." + resolution; + } else { + return null; + } + }; "sd:upstream": @@ -31,6 +40,8 @@ requirements: sc_atac_sample: - "cellranger-arc-count.cwl" - "cellranger-arc-aggr.cwl" + sc_reference_model: + - "sc-ctype-assign.cwl" inputs: @@ -69,12 +80,58 @@ inputs: "sd:upstreamSource": "sc_atac_sample/atac_fragments_file" "sd:localLabel": true - reference_source_column: - type: string - label: "Reference Seurat Object annotation column" + reference_data_rds: + type: File? + label: "Reference Single-cell Analysis with Assigned Cell Types (for a custom reference attach files below)" doc: | - Column from the metadata of the reference Seurat - object to select the reference annotations. + Analysis that includes single-cell + RNA-Seq datasets run through the + "Single-Cell Manual Cell Type + Assignment" pipeline based on the + RNA or WNN clustering results. + "sd:upstreamSource": "sc_reference_model/reference_data_rds" + "sd:localLabel": true + + reference_data_index: + type: File? 
+ "sd:upstreamSource": "sc_reference_model/reference_data_index" + + query_reduction: + type: + - "null" + - type: enum + symbols: + - "RNA" + - "ATAC" + - "WNN" + "sd:upstreamSource": "sc_reference_model/query_reduction" + + query_resolution: + type: float? + "sd:upstreamSource": "sc_reference_model/query_resolution" + + custom_reference_data_rds: + type: File? + label: "Custom reference Seurat Object (optional)" + doc: | + RDS file to load the reference Seurat object from. + This file can be downloaded as ref.Rds from the + https://azimuth.hubmapconsortium.org/references/ + + custom_reference_data_index: + type: File? + label: "Custom reference Annoy Index (optional)" + doc: | + Annoy index file for the provided reference RDS file. + This file can be downloaded as idx.annoy from the + https://azimuth.hubmapconsortium.org/references/ + + custom_reference_source_column: + type: string? + label: "Custom reference annotation column to select cell types (optional)" + doc: | + Column from the metadata of the custom reference + Seurat object to select the reference annotations. minimum_confidence_score: type: float? @@ -160,22 +217,6 @@ inputs: (optional)" input is not provided. Default: None - reference_data_rds: - type: File - label: "Reference Seurat Object (ref.Rds) file" - doc: | - RDS file to load the reference Seurat object from. - This file can be downloaded as ref.Rds from the - https://azimuth.hubmapconsortium.org/references/ - - reference_data_index: - type: File - label: "Reference Annoy Index (idx.annoy) file" - doc: | - Annoy index file for the provided reference RDS file. - This file can be downloaded as idx.annoy from the - https://azimuth.hubmapconsortium.org/references/ - export_loupe_data: type: boolean? 
default: false @@ -834,9 +875,48 @@ steps: run: ../tools/sc-rna-azimuth.cwl in: query_data_rds: query_data_rds - reference_data_rds: reference_data_rds - reference_data_index: reference_data_index - reference_source_column: reference_source_column + reference_data_rds: + source: [reference_data_rds, custom_reference_data_rds, custom_reference_data_index, custom_reference_source_column] + valueFrom: | + ${ + if ( + self[1] != null && self[1].class == "File" && + self[2] != null && self[2].class == "File" && + self[3] != null + ){ + return self[1]; + } else { + return self[0]; + } + } + reference_data_index: + source: [reference_data_index, custom_reference_data_rds, custom_reference_data_index, custom_reference_source_column] + valueFrom: | + ${ + if ( + self[1] != null && self[1].class == "File" && + self[2] != null && self[2].class == "File" && + self[3] != null + ){ + return self[2]; + } else { + return self[0]; + } + } + reference_source_column: + source: [query_reduction, query_resolution, custom_reference_data_rds, custom_reference_data_index, custom_reference_source_column] + valueFrom: | + ${ + if ( + self[2] != null && self[2].class == "File" && + self[3] != null && self[3].class == "File" && + self[4] != null + ){ + return self[4]; + } else { + return get_source_column("custom_", self[0], self[1]); + } + } minimum_confidence_score: minimum_confidence_score minimum_mapping_score: minimum_mapping_score atac_fragments_file: atac_fragments_file @@ -992,21 +1072,22 @@ s:creator: doc: | Single-Cell RNA-Seq Reference Mapping - Assigns identities to cells based on the reference annotation - using the Azimuth R package. Reference models can be downloaded - from the https://azimuth.hubmapconsortium.org/ website. 
This - workflow can be run with the outputs of the following pipelines: - "Single-Cell RNA-Seq Filtering Analysis", "Single-Cell Multiome - ATAC-Seq and RNA-Seq Filtering Analysis", "Single-Cell RNA-Seq - Dimensionality Reduction Analysis", "Single-Cell RNA-Seq Cluster - Analysis", and "Single-Cell WNN Cluster Analysis". It can also be - used with the outputs of: "Single-Cell ATAC-Seq Dimensionality - Reduction Analysis", "Single-Cell ATAC-Seq Cluster Analysis", - "Single-Cell Manual Cell Type Assignment" pipelines if these were - part of the multiome data analysis. The results of this workflow - are compatible with any single cell pipeline normally used after - the "Single-Cell RNA-Seq Filtering Analysis" or "Single-Cell - Multiome ATAC-Seq and RNA-Seq Filtering Analysis" pipelines, - depending on the preceding analysis step. In other words, this - pipeline predicts cell types for high-quality cells without - impacting subsequent data analysis steps. \ No newline at end of file + Uses Azimuth R package to assign identities to cells based on the + reference annotation from the results of the "Single-Cell Manual + Cell Type Assignment" pipeline. Alternatively, custom reference + models can be downloaded from the + https://azimuth.hubmapconsortium.org/ website. This workflow can + be run with the outputs of the following pipelines: "Single-Cell + RNA-Seq Filtering Analysis", "Single-Cell Multiome ATAC-Seq and + RNA-Seq Filtering Analysis", "Single-Cell RNA-Seq Dimensionality + Reduction Analysis", "Single-Cell RNA-Seq Cluster Analysis", and + "Single-Cell WNN Cluster Analysis". It can also be used with the + outputs of: "Single-Cell ATAC-Seq Dimensionality Reduction Analysis", + "Single-Cell ATAC-Seq Cluster Analysis", "Single-Cell Manual Cell + Type Assignment" pipelines if these were part of the multiome data + analysis. 
The results of this workflow are compatible with any + single cell pipeline normally used after the "Single-Cell RNA-Seq + Filtering Analysis" or "Single-Cell Multiome ATAC-Seq and RNA-Seq + Filtering Analysis" pipelines, depending on the preceding analysis + step. In other words, this pipeline predicts cell types for + high-quality cells without impacting subsequent data analysis steps. \ No newline at end of file From 303a92cef458b9745bc77fc6d105257a499def8e Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 3 Oct 2024 11:19:26 -0400 Subject: [PATCH 158/162] Rename outputs in fastq-dump tool to correspond to what Robert has in a workflow --- tools/fastq-dump.cwl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/fastq-dump.cwl b/tools/fastq-dump.cwl index feaf5100..2d1775c4 100644 --- a/tools/fastq-dump.cwl +++ b/tools/fastq-dump.cwl @@ -168,10 +168,10 @@ outputs: return (!!splitted_line.length)?splitted_line:null; } - stdout_log: + log_stdout: type: stdout - stderr_log: + log_stderr: type: stderr From 9ec9b08a102e1507f0c4c70c5d9635d1a3620623 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Thu, 3 Oct 2024 11:39:34 -0400 Subject: [PATCH 159/162] Need to have RNA as a default reduction in sc rna azimuth, because when it's selected from the upstream ctype experiment and that input was not provided, null will be inherited. 
--- workflows/sc-rna-azimuth.cwl | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/sc-rna-azimuth.cwl b/workflows/sc-rna-azimuth.cwl index 5aaac260..e3426d17 100644 --- a/workflows/sc-rna-azimuth.cwl +++ b/workflows/sc-rna-azimuth.cwl @@ -104,6 +104,7 @@ inputs: - "RNA" - "ATAC" - "WNN" + default: "RNA" "sd:upstreamSource": "sc_reference_model/query_reduction" query_resolution: From 3e2842c78ede52001837a2d3cf6aa16379c7fc68 Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 4 Oct 2024 14:11:03 -0400 Subject: [PATCH 160/162] Patch cellranger arc image so it won't fail on NFS --- tools/cellranger-arc-aggr.cwl | 2 +- tools/cellranger-arc-count.cwl | 2 +- tools/cellranger-arc-mkref.cwl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/cellranger-arc-aggr.cwl b/tools/cellranger-arc-aggr.cwl index 6f6e82f5..2bed0347 100644 --- a/tools/cellranger-arc-aggr.cwl +++ b/tools/cellranger-arc-aggr.cwl @@ -34,7 +34,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: cumulusprod/cellranger-arc:2.0.2 + dockerPull: biowardrobe2/cellranger-arc:v0.0.1 inputs: diff --git a/tools/cellranger-arc-count.cwl b/tools/cellranger-arc-count.cwl index 3d8a18fa..e1ef6b58 100644 --- a/tools/cellranger-arc-count.cwl +++ b/tools/cellranger-arc-count.cwl @@ -73,7 +73,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: cumulusprod/cellranger-arc:2.0.2 + dockerPull: biowardrobe2/cellranger-arc:v0.0.1 inputs: diff --git a/tools/cellranger-arc-mkref.cwl b/tools/cellranger-arc-mkref.cwl index 2f9d4594..d4f3179d 100644 --- a/tools/cellranger-arc-mkref.cwl +++ b/tools/cellranger-arc-mkref.cwl @@ -36,7 +36,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: cumulusprod/cellranger-arc:2.0.2 + dockerPull: biowardrobe2/cellranger-arc:v0.0.1 inputs: From e6cdbb44cea770731ca8d3b55c349f996497e75e Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Fri, 11 Oct 2024 15:55:24 -0400 Subject: [PATCH 161/162] Add geneset analysis to 
the clustering pipelines --- tools/sc-ctype-assign.cwl | 44 ++++++++++++ tools/sc-rna-azimuth.cwl | 48 ++++++++++++- tools/sc-rna-cluster.cwl | 58 ++++++++++++++-- tools/sc-wnn-cluster.cwl | 123 ++++++++++++++++++---------------- workflows/sc-ctype-assign.cwl | 51 ++++++++++++++ workflows/sc-rna-azimuth.cwl | 54 +++++++++++++++ workflows/sc-rna-cluster.cwl | 58 ++++++++++++++++ workflows/sc-wnn-cluster.cwl | 75 +++++++++++++++++++++ 8 files changed, 448 insertions(+), 63 deletions(-) diff --git a/tools/sc-ctype-assign.cwl b/tools/sc-ctype-assign.cwl index 18f3e5db..ff82e4b4 100644 --- a/tools/sc-ctype-assign.cwl +++ b/tools/sc-ctype-assign.cwl @@ -209,6 +209,18 @@ inputs: file should be provided. Default: None + genesets_data: + type: File? + inputBinding: + prefix: "--genesets" + doc: | + Path to the GMT file for calculating average expression levels + (module scores) per gene set. This file can be downloaded from + the Molecular Signatures Database (MSigDB) following the link + https://www.gsea-msigdb.org/gsea/msigdb. To calculate module + scores the loaded Seurat object should include RNA assay. + Default: do not calculate gene set expression scores. + cvrg_upstream_bp: type: int? inputBinding: @@ -590,6 +602,30 @@ outputs: the smallest group. PNG format. + gse_per_cell_plot_png: + type: File? + outputBinding: + glob: "*_gse_per_cell.png" + doc: | + UMAP colored by gene set expression score. + PNG format. + + gse_avg_plot_png: + type: File? + outputBinding: + glob: "*_gse_avg.png" + doc: | + Average gene set expression score. + PNG format. + + gse_dnst_plot_png: + type: File? + outputBinding: + glob: "*_gse_dnst.png" + doc: | + Gene set expression score density. + PNG format. + xpr_avg_plot_png: type: File?
outputBinding: @@ -869,6 +905,7 @@ s:about: | [--atactestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] [--fragments FRAGMENTS] [--genes [GENES [GENES ...]]] + [--genesets GENESETS] [--upstream UPSTREAM] [--downstream DOWNSTREAM] [--pdf] [--verbose] [--h5seurat] [--h5ad] @@ -962,6 +999,13 @@ s:about: | frequency plots for the nearest peaks the loaded Seurat object should include ATAC assay as well as the --fragments file should be provided. Default: None + --genesets GENESETS Path to the GMT file for calculating average + expression levels (module scores) per gene set. This + file can be downloaded from the Molecular Signatures + Database (MSigDB) following the link https://www.gsea- + msigdb.org/gsea/msigdb. To calculate module scores the + loaded Seurat object should include RNA assay. + Default: do not calculate gene set expression scores. --upstream UPSTREAM Number of bases to extend the genome coverage region for a specific gene upstream. Ignored if --genes or --fragments parameters are not provided. Default: 2500 diff --git a/tools/sc-rna-azimuth.cwl b/tools/sc-rna-azimuth.cwl index 5bb33b7d..aba45c7a 100644 --- a/tools/sc-rna-azimuth.cwl +++ b/tools/sc-rna-azimuth.cwl @@ -234,6 +234,17 @@ inputs: and the --fragments file should be provided. Default: None + genesets_data: + type: File? + inputBinding: + prefix: "--genesets" + doc: | + Path to the GMT file for calculating average expression levels + (module scores) per gene set. This file can be downloaded from + the Molecular Signatures Database (MSigDB) following the link + https://www.gsea-msigdb.org/gsea/msigdb. + Default: do not calculate gene set expression scores. + cvrg_upstream_bp: type: int? inputBinding: @@ -694,6 +705,34 @@ outputs: downsampled to the smallest group. PNG format. + gse_per_cell_plot_png: + type: File? + outputBinding: + glob: "*_gse_per_cell.png" + doc: | + Projected UMAP colored by + gene set expression score. + All query cells. + PNG format. 
+ + gse_avg_plot_png: + type: File? + outputBinding: + glob: "*_gse_avg.png" + doc: | + Average gene set expression score. + All query cells. + PNG format. + + gse_dnst_plot_png: + type: File? + outputBinding: + glob: "*_gse_dnst.png" + doc: | + Gene set expression score density. + All query cells. + PNG format. + xpr_avg_plot_png: type: File? outputBinding: @@ -966,6 +1005,7 @@ s:about: | [--atactestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] [--fragments FRAGMENTS] [--genes [GENES [GENES ...]]] + [--genesets GENESETS] [--upstream UPSTREAM] [--downstream DOWNSTREAM] [--pdf] [--verbose] [--h5seurat] [--h5ad] @@ -1007,7 +1047,7 @@ s:about: | The minimum threshold for a prediction mapping score is calculated at the cell. This metric ranges from 0 to 1 and reflects how well the unique structure of a - cell’s local neighborhood is preserved during + cell's local neighborhood is preserved during reference mapping. Only cells that meet both the minimum prediction mapping score and the minimum prediction confidence score thresholds will be @@ -1068,6 +1108,12 @@ s:about: | peaks the query Seurat object should include ATAC assay as well as the --fragments file should be provided. Default: None + --genesets GENESETS Path to the GMT file for calculating average + expression levels (module scores) per gene set. This + file can be downloaded from the Molecular Signatures + Database (MSigDB) following the link https://www.gsea- + msigdb.org/gsea/msigdb. Default: do not calculate gene + set expression scores. --upstream UPSTREAM Number of bases to extend the genome coverage region for a specific gene upstream. Ignored if --genes or --fragments parameters are not provided or when the diff --git a/tools/sc-rna-cluster.cwl b/tools/sc-rna-cluster.cwl index 1dee0a24..635bc151 100644 --- a/tools/sc-rna-cluster.cwl +++ b/tools/sc-rna-cluster.cwl @@ -90,6 +90,17 @@ inputs: Genes of interest to build genes expression plots. Default: None + genesets_data: + type: File? 
+ inputBinding: + prefix: "--genesets" + doc: | + Path to the GMT file for calculating average expression levels + (module scores) per gene set. This file can be downloaded from + the Molecular Signatures Database (MSigDB) following the link + https://www.gsea-msigdb.org/gsea/msigdb. + Default: do not calculate gene set expression scores. + identify_diff_genes: type: boolean? inputBinding: @@ -517,6 +528,38 @@ outputs: smallest group; all resolutions. PNG format. + gse_per_cell_plot_png: + type: File? + outputBinding: + glob: "*_gse_per_cell.png" + doc: | + UMAP colored by gene set expression score. + PNG format. + + gse_avg_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_gse_avg_res_*.png" + doc: | + Average gene set expression score. + All resolutions. + PNG format. + + gse_dnst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_gse_dnst_res_*.png" + doc: | + Gene set expression score density. + All resolutions. + PNG format. + xpr_per_cell_plot_png: type: - "null" @@ -753,12 +796,13 @@ s:about: | [--algorithm {louvain,mult-louvain,slm,leiden}] [--resolution [RESOLUTION [RESOLUTION ...]]] [--genes [GENES [GENES ...]]] - [--diffgenes] [--logfc LOGFC] - [--minpct MINPCT] [--onlypos] + [--genesets GENESETS] [--diffgenes] + [--logfc LOGFC] [--minpct MINPCT] + [--onlypos] [--testuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] [--pdf] [--verbose] [--h5seurat] - [--h5ad] [--cbbuild] [--scope] - [--output OUTPUT] + [--h5ad] [--loupe] [--cbbuild] + [--scope] [--output OUTPUT] [--theme {gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] [--seed SEED] @@ -792,6 +836,12 @@ s:about: | --genes [GENES [GENES ...]] Genes of interest to build genes expression plots. Default: None + --genesets GENESETS Path to the GMT file for calculating average + expression levels (module scores) per gene set. 
This + file can be downloaded from the Molecular Signatures + Database (MSigDB) following the link https://www.gsea- + msigdb.org/gsea/msigdb. Default: do not calculate gene + set expression scores. --diffgenes Identify differentially expressed genes (putative gene markers) between each pair of clusters for all resolutions. Default: false diff --git a/tools/sc-wnn-cluster.cwl b/tools/sc-wnn-cluster.cwl index 1f3c25c5..58b70710 100644 --- a/tools/sc-wnn-cluster.cwl +++ b/tools/sc-wnn-cluster.cwl @@ -81,50 +81,6 @@ inputs: Sensible values are in the range 0.001 to 0.5. Default: 0.3 - umap_neighbors: - type: int? - inputBinding: - prefix: "--uneighbors" - doc: | - Determines the number of neighboring points used in UMAP. Larger values will result - in more global structure being preserved at the loss of detailed local structure. - In general this parameter should often be in the range 5 to 50. - Default: 30 - - umap_metric: - type: - - "null" - - type: enum - symbols: - - "euclidean" - - "manhattan" - - "chebyshev" - - "minkowski" - - "canberra" - - "braycurtis" - - "mahalanobis" - - "wminkowski" - - "seuclidean" - - "cosine" - - "correlation" - - "haversine" - - "hamming" - - "jaccard" - - "dice" - - "russelrao" - - "kulsinski" - - "ll_dirichlet" - - "hellinger" - - "rogerstanimoto" - - "sokalmichener" - - "sokalsneath" - - "yule" - inputBinding: - prefix: "--umetric" - doc: | - The metric to use to compute distances in high dimensional space for UMAP. - Default: cosine - umap_method: type: - "null" @@ -176,6 +132,17 @@ inputs: plots will be built. Default: None + genesets_data: + type: File? + inputBinding: + prefix: "--genesets" + doc: | + Path to the GMT file for calculating average expression levels + (module scores) per gene set. This file can be downloaded from + the Molecular Signatures Database (MSigDB) following the link + https://www.gsea-msigdb.org/gsea/msigdb. + Default: do not calculate gene set expression scores. + cvrg_upstream_bp: type: int? 
inputBinding: @@ -601,6 +568,18 @@ outputs: All cells; all resolutions. PNG format + slh_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_slh_gr_clst_res_*.png" + doc: | + Silhouette scores. + All cells; all resolutions. + PNG format. + umap_gr_clst_spl_idnt_res_plot_png: type: - "null" @@ -712,6 +691,38 @@ outputs: smallest group; all resolutions. PNG format. + gse_per_cell_plot_png: + type: File? + outputBinding: + glob: "*_gse_per_cell.png" + doc: | + UMAP colored by gene set expression score. + PNG format. + + gse_avg_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_gse_avg_res_*.png" + doc: | + Average gene set expression score. + All resolutions. + PNG format. + + gse_dnst_res_plot_png: + type: + - "null" + - type: array + items: File + outputBinding: + glob: "*_gse_dnst_res_*.png" + doc: | + Gene set expression score density. + All resolutions. + PNG format. + xpr_per_cell_plot_png: type: - "null" @@ -982,12 +993,11 @@ s:about: | [--algorithm {louvain,mult-louvain,slm,leiden}] [--uspread USPREAD] [--umindist UMINDIST] - [--uneighbors UNEIGHBORS] - [--umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis,mahalanobis,wminkowski,seuclidean,cosine,correlation,haversine,hamming,jaccard,dice,russelrao,kulsinski,ll_dirichlet,hellinger,rogerstanimoto,sokalmichener,sokalsneath,yule}] [--umethod {uwot,uwot-learn,umap-learn}] [--resolution [RESOLUTION [RESOLUTION ...]]] [--fragments FRAGMENTS] [--genes [GENES [GENES ...]]] + [--genesets GENESETS] [--upstream UPSTREAM] [--downstream DOWNSTREAM] [--diffgenes] [--diffpeaks] [--rnalogfc RNALOGFC] @@ -997,8 +1007,8 @@ s:about: | [--atacminpct ATACMINPCT] [--atactestuse {wilcox,bimod,roc,t,negbinom,poisson,LR,MAST,DESeq2}] [--pdf] [--verbose] [--h5seurat] - [--h5ad] [--cbbuild] [--scope] - [--output OUTPUT] + [--h5ad] [--loupe] [--cbbuild] + [--scope] [--output OUTPUT] [--theme 
{gray,bw,linedraw,light,dark,minimal,classic,void}] [--cpus CPUS] [--memory MEMORY] [--seed SEED] @@ -1026,7 +1036,7 @@ s:about: | integrated with Harmony. Default: 10 --algorithm {louvain,mult-louvain,slm,leiden} Algorithm for modularity optimization when running - clustering. Default: louvain + clustering. Default: slm --uspread USPREAD The effective scale of embedded points on UMAP. In combination with '--mindist' it determines how clustered/clumped the embedded points are. Default: 1 @@ -1036,15 +1046,6 @@ s:about: | values allow the algorithm to optimise more accurately with regard to local structure. Sensible values are in the range 0.001 to 0.5. Default: 0.3 - --uneighbors UNEIGHBORS - Determines the number of neighboring points used in - UMAP. Larger values will result in more global - structure being preserved at the loss of detailed - local structure. In general this parameter should - often be in the range 5 to 50. Default: 30 - --umetric {euclidean,manhattan,chebyshev,minkowski,canberra,braycurtis,mahalanobis,wminkowski,seuclidean,cosine,correlation,haversine,hamming,jaccard,dice,russelrao,kulsinski,ll_dirichlet,hellinger,rogerstanimoto,sokalmichener,sokalsneath,yule} - The metric to use to compute distances in high - dimensional space for UMAP. Default: cosine --umethod {uwot,uwot-learn,umap-learn} UMAP implementation to run. If set to 'umap-learn' use --umetric 'correlation' Default: uwot @@ -1065,6 +1066,12 @@ s:about: | insertion frequency plots for the nearest peaks. If ' --fragments' is not provided only gene expression plots will be built. Default: None + --genesets GENESETS Path to the GMT file for calculating average + expression levels (module scores) per gene set. This + file can be downloaded from the Molecular Signatures + Database (MSigDB) following the link https://www.gsea- + msigdb.org/gsea/msigdb. Default: do not calculate gene + set expression scores. 
--upstream UPSTREAM Number of bases to extend the genome coverage region for a specific gene upstream. Ignored if --genes or --fragments parameters are not provided. Default: 2500 diff --git a/workflows/sc-ctype-assign.cwl b/workflows/sc-ctype-assign.cwl index e81fd12b..56616659 100644 --- a/workflows/sc-ctype-assign.cwl +++ b/workflows/sc-ctype-assign.cwl @@ -177,6 +177,17 @@ inputs: The file should have two columns named 'cluster' and 'celltype'. + genesets_data: + type: File? + label: "GMT file for calculating average expression levels per gene set (optional)" + doc: | + Path to the GMT file for calculating average expression levels + (module scores) per gene set. This file can be downloaded from + the Molecular Signatures Database (MSigDB) following the link + https://www.gsea-msigdb.org/gsea/msigdb. To calculate module + scores the loaded Seurat object should include RNA assay. + Default: do not calculate gene set expression scores. + export_loupe_data: type: boolean? default: false @@ -580,6 +591,42 @@ outputs: tab: "Genes of interest (coverage)" Caption: "ATAC fragment coverage (per gene)" + gse_per_cell_plot_png: + type: File? + outputSource: ctype_assign/gse_per_cell_plot_png + label: "UMAP colored by gene set expression score" + doc: | + UMAP colored by gene set expression score. + PNG format. + "sd:visualPlugins": + - image: + tab: "Gene sets of interest (expression)" + Caption: "UMAP colored by gene set expression score" + + gse_avg_plot_png: + type: File? + outputSource: ctype_assign/gse_avg_plot_png + label: "Average gene set expression score" + doc: | + Average gene set expression score. + PNG format. + "sd:visualPlugins": + - image: + tab: "Gene sets of interest (expression)" + Caption: "Average gene set expression score" + + gse_dnst_plot_png: + type: File? + outputSource: ctype_assign/gse_dnst_plot_png + label: "Gene set expression score density" + doc: | + Gene set expression score density. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "Gene sets of interest (expression)" + Caption: "Gene set expression score density" + xpr_htmp_plot_png: type: File? outputSource: ctype_assign/xpr_htmp_plot_png @@ -764,6 +811,7 @@ steps: genes_of_interest: source: genes_of_interest valueFrom: $(split_features(self)) + genesets_data: genesets_data identify_diff_genes: identify_diff_genes identify_diff_peaks: identify_diff_peaks rna_minimum_logfc: @@ -821,6 +869,9 @@ steps: - cmp_gr_ctyp_spl_cnd_plot_png - umap_gr_ph_spl_cnd_plot_png - cmp_gr_ph_spl_cnd_plot_png + - gse_per_cell_plot_png + - gse_avg_plot_png + - gse_dnst_plot_png - xpr_avg_plot_png - xpr_per_cell_plot_png - xpr_dnst_plot_png diff --git a/workflows/sc-rna-azimuth.cwl b/workflows/sc-rna-azimuth.cwl index e3426d17..05e2acbc 100644 --- a/workflows/sc-rna-azimuth.cwl +++ b/workflows/sc-rna-azimuth.cwl @@ -218,6 +218,16 @@ inputs: (optional)" input is not provided. Default: None + genesets_data: + type: File? + label: "GMT file for calculating average expression levels per gene set (optional)" + doc: | + Path to the GMT file for calculating average expression levels + (module scores) per gene set. This file can be downloaded from + the Molecular Signatures Database (MSigDB) following the link + https://www.gsea-msigdb.org/gsea/msigdb. + Default: do not calculate gene set expression scores. + export_loupe_data: type: boolean? default: false @@ -742,6 +752,46 @@ outputs: tab: "Genes of interest (coverage)" Caption: "ATAC fragment coverage (per gene, filtered query cells)" + gse_per_cell_plot_png: + type: File? + outputSource: rna_azimuth/gse_per_cell_plot_png + label: "Projected UMAP colored by gene set expression score (all query cells)" + doc: | + Projected UMAP colored by + gene set expression score. + All query cells. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "Gene sets of interest (expression)" + Caption: "Projected UMAP colored by gene set expression score (all query cells)" + + gse_avg_plot_png: + type: File? + outputSource: rna_azimuth/gse_avg_plot_png + label: "Average gene set expression score (all query cells)" + doc: | + Average gene set expression score. + All query cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "Gene sets of interest (expression)" + Caption: "Average gene set expression score (all query cells)" + + gse_dnst_plot_png: + type: File? + outputSource: rna_azimuth/gse_dnst_plot_png + label: "Gene set expression score density (all query cells)" + doc: | + Gene set expression score density. + All query cells. + PNG format. + "sd:visualPlugins": + - image: + tab: "Gene sets of interest (expression)" + Caption: "Gene set expression score density (all query cells)" + xpr_htmp_plot_png: type: File? outputSource: rna_azimuth/xpr_htmp_plot_png @@ -924,6 +974,7 @@ steps: genes_of_interest: source: genes_of_interest valueFrom: $(split_features(self)) + genesets_data: genesets_data identify_diff_genes: identify_diff_genes identify_diff_peaks: identify_diff_peaks rna_minimum_logfc: @@ -987,6 +1038,9 @@ steps: - cmp_gr_ctyp_spl_cnd_plot_png - umap_gr_ph_spl_cnd_plot_png - cmp_gr_ph_spl_cnd_plot_png + - gse_per_cell_plot_png + - gse_avg_plot_png + - gse_dnst_plot_png - xpr_avg_plot_png - xpr_dnst_plot_png - xpr_per_cell_plot_png diff --git a/workflows/sc-rna-cluster.cwl b/workflows/sc-rna-cluster.cwl index c3cfdc51..8dec35ca 100644 --- a/workflows/sc-rna-cluster.cwl +++ b/workflows/sc-rna-cluster.cwl @@ -129,6 +129,16 @@ inputs: of interest to visualize expression. Default: None + genesets_data: + type: File? + label: "GMT file for calculating average expression levels per gene set (optional)" + doc: | + Path to the GMT file for calculating average expression levels + (module scores) per gene set. 
This file can be downloaded from + the Molecular Signatures Database (MSigDB) following the link + https://www.gsea-msigdb.org/gsea/msigdb. + Default: do not calculate gene set expression scores. + export_loupe_data: type: boolean? default: false @@ -522,6 +532,50 @@ outputs: tab: "Genes of interest (expression)" Caption: "UMAP colored by gene expression (per gene)" + gse_per_cell_plot_png: + type: File? + outputSource: sc_rna_cluster/gse_per_cell_plot_png + label: "UMAP colored by gene set expression score" + doc: | + UMAP colored by gene set expression score. + PNG format. + "sd:visualPlugins": + - image: + tab: "Gene sets of interest (expression)" + Caption: "UMAP colored by gene set expression score" + + gse_avg_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/gse_avg_res_plot_png + label: "Average gene set expression score" + doc: | + Average gene set expression score. + All resolutions. + PNG format. + "sd:visualPlugins": + - image: + tab: "Gene sets of interest (expression)" + Caption: "Average gene set expression score" + + gse_dnst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_rna_cluster/gse_dnst_res_plot_png + label: "Gene set expression score density" + doc: | + Gene set expression score density. + All resolutions. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "Gene sets of interest (expression)" + Caption: "Gene set expression score density" + xpr_htmp_res_plot_png: type: - "null" @@ -658,6 +712,7 @@ steps: genes_of_interest: source: genes_of_interest valueFrom: $(split_features(self)) + genesets_data: genesets_data identify_diff_genes: identify_diff_genes only_positive_diff_genes: default: true @@ -703,6 +758,9 @@ steps: - cmp_gr_ph_spl_clst_res_plot_png - umap_gr_clst_spl_cnd_res_plot_png - cmp_gr_clst_spl_cnd_res_plot_png + - gse_per_cell_plot_png + - gse_avg_res_plot_png + - gse_dnst_res_plot_png - xpr_per_cell_plot_png - xpr_avg_res_plot_png - xpr_dnst_res_plot_png diff --git a/workflows/sc-wnn-cluster.cwl b/workflows/sc-wnn-cluster.cwl index fbcea837..c90a0a17 100644 --- a/workflows/sc-wnn-cluster.cwl +++ b/workflows/sc-wnn-cluster.cwl @@ -178,6 +178,16 @@ inputs: plots will be created as well. Default: None + genesets_data: + type: File? + label: "GMT file for calculating average expression levels per gene set (optional)" + doc: | + Path to the GMT file for calculating average expression levels + (module scores) per gene set. This file can be downloaded from + the Molecular Signatures Database (MSigDB) following the link + https://www.gsea-msigdb.org/gsea/msigdb. + Default: do not calculate gene set expression scores. + export_loupe_data: type: boolean? default: false @@ -412,6 +422,22 @@ outputs: tab: "Split by cluster" Caption: "UMAP colored by cluster (all cells)" + slh_gr_clst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/slh_gr_clst_res_plot_png + label: "Silhouette scores (all cells)" + doc: | + Silhouette scores. + All cells. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "Split by cluster" + Caption: "Silhouette scores (all cells)" + umap_gr_clst_spl_ph_res_plot_png: type: - "null" @@ -638,6 +664,50 @@ outputs: tab: "Genes of interest (coverage)" Caption: "ATAC fragment coverage (per gene)" + gse_per_cell_plot_png: + type: File? + outputSource: sc_wnn_cluster/gse_per_cell_plot_png + label: "UMAP colored by gene set expression score" + doc: | + UMAP colored by gene set expression score. + PNG format. + "sd:visualPlugins": + - image: + tab: "Gene sets of interest (expression)" + Caption: "UMAP colored by gene set expression score" + + gse_avg_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/gse_avg_res_plot_png + label: "Average gene set expression score" + doc: | + Average gene set expression score. + All resolutions. + PNG format. + "sd:visualPlugins": + - image: + tab: "Gene sets of interest (expression)" + Caption: "Average gene set expression score" + + gse_dnst_res_plot_png: + type: + - "null" + - type: array + items: File + outputSource: sc_wnn_cluster/gse_dnst_res_plot_png + label: "Gene set expression score density" + doc: | + Gene set expression score density. + All resolutions. + PNG format. 
+ "sd:visualPlugins": + - image: + tab: "Gene sets of interest (expression)" + Caption: "Gene set expression score density" + xpr_htmp_res_plot_png: type: - "null" @@ -788,6 +858,7 @@ steps: genes_of_interest: source: genes_of_interest valueFrom: $(split_features(self)) + genesets_data: genesets_data identify_diff_genes: identify_diff_genes identify_diff_peaks: identify_diff_peaks rna_minimum_logfc: @@ -837,12 +908,16 @@ steps: - umap_gr_ph_spl_cnd_plot_png - cmp_gr_ph_spl_cnd_plot_png - umap_gr_clst_res_plot_png + - slh_gr_clst_res_plot_png - umap_gr_clst_spl_idnt_res_plot_png - cmp_gr_clst_spl_idnt_res_plot_png - umap_gr_clst_spl_ph_res_plot_png - cmp_gr_ph_spl_clst_res_plot_png - umap_gr_clst_spl_cnd_res_plot_png - cmp_gr_clst_spl_cnd_res_plot_png + - gse_per_cell_plot_png + - gse_avg_res_plot_png + - gse_dnst_res_plot_png - xpr_per_cell_plot_png - xpr_avg_res_plot_png - xpr_dnst_res_plot_png From 466b02c2264d1c30256d6a53896e53549b2354bb Mon Sep 17 00:00:00 2001 From: Michael Kotliar Date: Mon, 11 Nov 2024 15:25:36 -0500 Subject: [PATCH 162/162] Update docker image for an old deseq multi factor analysis to make it not fail when there are not enough samples for some of the autoestimated contrasts --- tools/deseq-multi-factor.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/deseq-multi-factor.cwl b/tools/deseq-multi-factor.cwl index f17c704f..e836ad6e 100644 --- a/tools/deseq-multi-factor.cwl +++ b/tools/deseq-multi-factor.cwl @@ -8,7 +8,7 @@ requirements: hints: - class: DockerRequirement - dockerPull: biowardrobe2/deseq:v0.0.6 + dockerPull: biowardrobe2/deseq:v0.0.7 inputs: