diff --git a/.gitignore b/.gitignore index 5469b47..51c0647 100644 --- a/.gitignore +++ b/.gitignore @@ -16,4 +16,5 @@ eager_inputs_old/ eager_outputs_old/ array_Logs/ poseidon_packages/ -debug_tables/ \ No newline at end of file +debug_tables/ +*.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b8293d..f606be4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,25 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.5.0] - 30/09/2024 + +### `Added` + +- Processing of RM data. (Twist capture + mtDNA) + - `conf/Autorun.config`: Add RM profile for processing Twist capture + mtDNA results. Identical to TF for now. + - `scripts/prepare_eager_tsv.R`: Add RM analysis type for Twist capture + mtDNA results. + - `scripts/run_Eager.sh`: Run for RM TSVs. + - `scripts/cron_daily_prepare.sh`: Create RM analysis TSVs daily. + - `scripts/ethical_sample_scrub.sh`: Add RM analysis type for ethical sample scrubbing. + - `scripts/clear_work_dirs.sh`: Add RM analysis type for work directory clearing. + - `scripts/clear_results.sh`: Add RM analysis type for results directory clearing. + +### `Fixed` + +### `Dependencies` + +### `Deprecated` + ## [1.4.0] - 12/07/2023 ### `Added` diff --git a/README.md b/README.md index 8b50f17..5bfb573 100644 --- a/README.md +++ b/README.md @@ -4,17 +4,19 @@ Automated nf-core/eager processing of Autorun output bams. 
## Quickstart -- Run `prepare_eager_tsv.R` for human SG or TF data for a given sequencing batch: +- Run `prepare_eager_tsv.R` for human SG, TF, RP, or RM data for a given sequencing batch: ```bash prepare_eager_tsv.R -s -a SG -o eager_inputs/ -d .eva_credentials prepare_eager_tsv.R -s -a TF -o eager_inputs/ -d .eva_credentials + prepare_eager_tsv.R -s -a RP -o eager_inputs/ -d .eva_credentials + prepare_eager_tsv.R -s -a RM -o eager_inputs/ -d .eva_credentials ``` - Run eager with the following script, which then runs on the generated TSV files: ```bash - run_Eager.sh + run_Eager.sh -a ``` ⚠️ For some library preparation protocols and external libraries, UDG treatment cannot be reliably inferred, and errors will be thrown. @@ -22,7 +24,7 @@ In such cases, an eager input TSV will still be created, but UDG treatment for a ## Autorun.config -Contains the `autorun`, `SG` and `TF` profiles. +Contains the `autorun`, `local_paths`, `SG`, `TF`, `RP`, and `RM` profiles. ### autorun @@ -43,43 +45,51 @@ The standardised parameters for processing human shotgun data. The standardised parameters for processing human 1240k capture data. -## prepare_eager_tsv.R - -An R script that when given a sequencing batch ID, Autorun Analysis type and PANDORA credentials will create/update eager input TSV files for further processing. - -```bash -Usage: ./prepare_eager_tsv.R [options] .credentials +### RP -Options: - -h, --help - Show this help message and exit +The standardised parameters for processing human Twist capture data. - -s SEQUENCING_BATCH_ID, --sequencing_batch_id=SEQUENCING_BATCH_ID - The Pandora sequencing batch ID to update eager input for. A TSV file will be prepared - for each individual in this run, containing all relevant processed BAM files - from the individual +### RM - -a ANALYSIS_TYPE, --analysis_type=ANALYSIS_TYPE - The analysis type to compile the data from. Should be one of: 'SG', 'TF'. +The standardised parameters for processing human Twist+MT capture data. 
- -r, --rename - Changes all dots (.) in the Library_ID field of the output to underscores (_). - Some tools used in nf-core/eager will strip everything after the first dot (.) - from the name of the input file, which can cause naming conflicts in rare cases. +## prepare_eager_tsv.R - -w WHITELIST, --whitelist=WHITELIST - An optional file that includes the IDs of whitelisted individuals, - one per line. Only the TSVs for these individuals will be updated. +An R script that when given a sequencing batch ID, Autorun Analysis type and PANDORA credentials will create/update eager input TSV files for further processing. - -o OUTDIR/, --outDir=OUTDIR/ - The desired output directory. Within this directory, one subdirectory will be - created per analysis type, within that one subdirectory per individual ID, - and one TSV within each of these directory. +```bash +Usage: ./scripts/prepare_eager_tsv.R [options] .credentials - -d, --debug_output - When provided, the entire result table for the run will be saved as '.results.txt'. - Helpful to check all the output data in one place. +Options: + -h, --help + Show this help message and exit + + -s SEQUENCING_BATCH_ID, --sequencing_batch_id=SEQUENCING_BATCH_ID + The Pandora sequencing batch ID to update eager input for. A TSV file will be prepared + for each individual in this run, containing all relevant processed BAM files + from the individual + + -a ANALYSIS_TYPE, --analysis_type=ANALYSIS_TYPE + The analysis type to compile the data from. Should be one of: 'SG', 'TF', 'RP', 'RM'. + + -r, --rename + Changes all dots (.) in the Library_ID field of the output to underscores (_). + Some tools used in nf-core/eager will strip everything after the first dot (.) + from the name of the input file, which can cause naming conflicts in rare cases. + + -w WHITELIST, --whitelist=WHITELIST + An optional file that includes the IDs of whitelisted individuals, + one per line. Only the TSVs for these individuals will be updated. 
+ + -o OUTDIR, --outDir=OUTDIR + The desired output directory. Within this directory, one subdirectory will be + created per analysis type, within that one subdirectory per individual ID, + and one TSV within each of these directory. + + -d, --debug_output + When provided, the entire result table for the run will be saved as '.results.txt'. + Helpful to check all the output data in one place. Note: a valid sidora .credentials file is required. Contact the Pandora/Sidora team for details. ``` @@ -88,13 +98,21 @@ The eager input TSVs will be created in the following directory structure, given ```text eager_inputs ├── SG -│ └──IND -│ ├── IND001 -│ └── IND002 -└── TF - └──IND - ├── IND001 - └── IND002 +│ └──ABC +│ ├── ABC001 +│ └── ABC002 +├── TF +│ └──ABC +│ ├── ABC001 +│ └── ABC002 +├── RP +│ └──ABC +│ ├── ABC001 +│ └── ABC002 +└── RM + └──ABC + ├── ABC001 + └── ABC002 ``` Alongside each created TSV is a file named `autorun_eager_version.txt`, which states the version of Autorun_eager used. @@ -112,13 +130,21 @@ The outputs are saved with the same directory structure as the inputs, but in a ```text eager_outputs ├── SG -│ └──IND -│ ├── IND001 -│ └── IND002 -└── TF - └──IND - ├── IND001 - └── IND002 +│ └──ABC +│ ├── ABC001 +│ └── ABC002 +├── TF +│ └──ABC +│ ├── ABC001 +│ └── ABC002 +├── RP +│ └──ABC +│ ├── ABC001 +│ └── ABC002 +└── RM + └──ABC + ├── ABC001 + └── ABC002 ``` This script recognises the `-a/--array` option. When this is provided, instead of running eager jobs in sequence, a temporary file is created named `$(date +'%y%m%d_%H%M')_Autorun_eager_queue.txt` that includes the command line of all eager jobs to-be-ran, one per line. An "Autorun_eager spawner" (`AE_spawner`) array job is then submitted using `qsub`, which uses a secondary script named `scripts/submit_as_array.sh` to submit the command in each line of the temporary file as a separate task. In this manner, 10 eager runs can be ran in parallel. 
Logs for these jobs will then be added to a directory named `array_Logs//`. diff --git a/conf/Autorun.config b/conf/Autorun.config index eba45b9..3b1d8ed 100644 --- a/conf/Autorun.config +++ b/conf/Autorun.config @@ -26,7 +26,7 @@ profiles { fasta_index = '/mnt/archgen/Reference_Genomes/Human/hs37d5/hs37d5.fa.fai' bwa_index = '/mnt/archgen/Reference_Genomes/Human/hs37d5/' seq_dict = '/mnt/archgen/Reference_Genomes/Human/hs37d5/hs37d5.dict' - + // Qualimap bedfile for on-target coverage calculation snpcapture_bed = '/mnt/archgen/Reference_Genomes/Human/hs37d5/SNPCapBEDs/1240K.pos.list_hs37d5.0based.bed' @@ -50,14 +50,14 @@ profiles { bam_mapping_quality_threshold = 0 // Keep all mapped reads bam_unmapped_type = 'bam' // Keep unmapped reads as a separate BAM file for possible future pathogen screening. bam_filter_minreadlength = 30 // Do we need to add length filtering here at all? Does Kay's pre-processing do this? - + // mtDNA to nuclear ratio run_mtnucratio = true mtnucratio_header = "MT" - + // Ignore SNP capture bed for coverage calculations in non TF data. snpcapture_bed = null - + // Bam Trimming // ssDNA libraries are left untrimmed (pileupcaller deals with damage in those) // dsDNA half-udg are clipped 2bp on either side, while non-UDG are clipper 7bp @@ -89,7 +89,7 @@ profiles { // Nuclear contamination run_nuclear_contamination = true contamination_chrom_name = 'X' - + //1240k Coverage/Depth calculation (for poseidonisation) run_bedtools_coverage = true } @@ -104,11 +104,11 @@ profiles { bam_mapping_quality_threshold = 0 // Keep all mapped reads bam_unmapped_type = 'bam' // Keep unmapped reads as a separate BAM file for possible future pathogen screening. bam_filter_minreadlength = 30 // Do we need to add length filtering here at all? Does Kay's pre-processing do this? 
- + // mtDNA to nuclear ratio run_mtnucratio = true mtnucratio_header = "MT" - + // Bam Trimming // ssDNA libraries are left untrimmed (pileupcaller deals with damage in those) // dsDNA half-udg are clipped 2bp on either side, while non-UDG are clipper 7bp @@ -140,7 +140,7 @@ profiles { // Nuclear contamination run_nuclear_contamination = true contamination_chrom_name = 'X' - + //1240k Coverage/Depth calculation (for poseidonisation) run_bedtools_coverage = true } @@ -155,11 +155,11 @@ profiles { bam_mapping_quality_threshold = 0 // Keep all mapped reads bam_unmapped_type = 'bam' // Keep unmapped reads as a separate BAM file for possible future pathogen screening. bam_filter_minreadlength = 30 // Do we need to add length filtering here at all? Does Kay's pre-processing do this? - + // mtDNA to nuclear ratio run_mtnucratio = true mtnucratio_header = "MT" - + // Bam Trimming // ssDNA libraries are left untrimmed (pileupcaller deals with damage in those) // dsDNA half-udg are clipped 2bp on either side, while non-UDG are clipper 7bp @@ -191,7 +191,58 @@ profiles { // Nuclear contamination run_nuclear_contamination = true contamination_chrom_name = 'X' - + + //1240k Coverage/Depth calculation (for poseidonisation) + run_bedtools_coverage = true + } + } + + // Profile with parameters for runs using the Human_RM bams as input. + // Currently identical to TF profile. Just keeps the RM data separate for comparison. + RM { + params{ + // BAM filtering + run_bam_filtering = true // Filter out unmapped reads, so barplots in MultiQC are not completely overtaken by unmapped reads. + bam_mapping_quality_threshold = 0 // Keep all mapped reads + bam_unmapped_type = 'bam' // Keep unmapped reads as a separate BAM file for possible future pathogen screening. + bam_filter_minreadlength = 30 // Do we need to add length filtering here at all? Does Kay's pre-processing do this? 
+ + // mtDNA to nuclear ratio + run_mtnucratio = true + mtnucratio_header = "MT" + + // Bam Trimming + // ssDNA libraries are left untrimmed (pileupcaller deals with damage in those) + // dsDNA half-udg are clipped 2bp on either side, while non-UDG are clipped 7bp + run_trim_bam = true + bamutils_clip_single_stranded_half_udg_left = 0 // Set to 0 so ssDNA do not get trimmed. + bamutils_clip_single_stranded_half_udg_right = 0 // Set to 0 so ssDNA do not get trimmed. + bamutils_clip_single_stranded_none_udg_left = 0 // Set to 0 so ssDNA do not get trimmed. + bamutils_clip_single_stranded_none_udg_right = 0 // Set to 0 so ssDNA do not get trimmed. + bamutils_clip_double_stranded_half_udg_left = 2 // Trim 2 bp of either side for half-UDG libraries. + bamutils_clip_double_stranded_half_udg_right = 2 // Trim 2 bp of either side for half-UDG libraries. + // Usually for dsDNA non-UDG libraries this is between 5 and 10. I have set it to 7 arbitrarily since that was a good cutoff in my own projects so far. + bamutils_clip_double_stranded_none_udg_left = 7 // Trim 7 bp of either side for non-UDG libraries. + bamutils_clip_double_stranded_none_udg_right = 7 // Trim 7 bp of either side for non-UDG libraries. + + // Damage Calculation + damage_calculation_tool = 'mapdamage' + mapdamage_downsample = 100000 // Use 100k reads for damage calculation to lower runtime. + + // Genotyping + genotyping_source = 'trimmed' // Use trimmed bams for genotyping + run_genotyping = true + genotyping_tool = 'pileupcaller' + pileupcaller_min_map_quality = 25 // To allow for reads aligning with a mismatch, and reduce reference bias in genotypes. 
+ pileupcaller_min_base_quality = 30 + + //Sex determination + run_sexdeterrmine = true + + // Nuclear contamination + run_nuclear_contamination = true + contamination_chrom_name = 'X' + //1240k Coverage/Depth calculation (for poseidonisation) run_bedtools_coverage = true } diff --git a/scripts/clear_results.sh b/scripts/clear_results.sh index 653ceff..a34df84 100755 --- a/scripts/clear_results.sh +++ b/scripts/clear_results.sh @@ -12,7 +12,7 @@ function Helptext() { echo -ne "This script removes all output directory contents for the provided individuals, without clearing out caching, allowing for the results to be re-published.\n This enables refreshing of result directories when changes to the input might have changes merging of libraries, thus making the directory structure inconsistent.\n\n" echo -ne "Options:\n" echo -ne "-h, --help\t\tPrint this text and exit.\n" - echo -ne "-a, --analysis_type\t\tSet the analysis type. Options: TF, SG.\n" + echo -ne "-a, --analysis_type\t\tSet the analysis type. Options: TF, SG, RP, RM.\n" } ## Print messages to stderr, optionally with colours @@ -65,8 +65,8 @@ fi if [[ ${analysis_type} == '' ]]; then errecho "No --analysis_type was provided.\n" Helptext -elif [[ ${analysis_type} != "SG" && ${analysis_type} != "TF" && ${analysis_type} != "RP" ]]; then - errecho "analysis_type must be SG, TF, or RP. You provided: ${analysis_type}\n" +elif [[ ${analysis_type} != "SG" && ${analysis_type} != "TF" && ${analysis_type} != "RP" && ${analysis_type} != "RM" ]]; then + errecho "analysis_type must be SG, TF, RP, or RM. 
You provided: ${analysis_type}\n" Helptext fi diff --git a/scripts/clear_work_dirs.sh b/scripts/clear_work_dirs.sh index 0efcdee..5e7a57f 100755 --- a/scripts/clear_work_dirs.sh +++ b/scripts/clear_work_dirs.sh @@ -42,7 +42,7 @@ input_iids=($(cat ${ind_id_list_fn})) for ind_id in ${input_iids[@]}; do site_id=${ind_id:0:3} ## Site id is the first three characters of the individual ID errecho -ne "Clearing work directories for ${ind_id}..." - for analysis_type in "SG" "TF" "RP"; do + for analysis_type in "SG" "TF" "RP" "RM"; do if [[ -d ${root_eager_dir}/${analysis_type}/${site_id}/${ind_id}/work ]]; then errecho -ne " ${analysis_type}..." # ls -d ${root_eager_dir}/${analysis_type}/${site_id}/${ind_id}/work diff --git a/scripts/cron_daily_prepare.sh b/scripts/cron_daily_prepare.sh index f0e49f5..b89d83f 100755 --- a/scripts/cron_daily_prepare.sh +++ b/scripts/cron_daily_prepare.sh @@ -1,7 +1,7 @@ #!/bin/bash # determine which new Results have accumulated within a day, -# prepare for processing with EAGER +# prepare for processing with nf-core/eager ## Use creation time of bams to avoid picking up changes in statistics without change in data. cd /mnt/archgen/Autorun_eager @@ -11,18 +11,25 @@ cd /mnt/archgen/Autorun_eager find /mnt/archgen/Autorun/Results/Human_1240k/2* -name '*.bam' -mtime -1 2>/dev/null | cut -f 7 -d "/" | sort -u | while read RUN ; do echo "Processing TF data from run: ${RUN}" scripts/prepare_eager_tsv.R -s $RUN -a TF -o eager_inputs/ -d .eva_credentials -done +done # Shotgun # Note: this find only checks runs starting from 2020. Silence stderr to avoid 'permission denied'. find /mnt/archgen/Autorun/Results/Human_Shotgun/2* -name '*.bam' -mtime -1 2>/dev/null | cut -f 7 -d "/" | sort -u | while read RUN ; do echo "Processing SG data from run: ${RUN}" scripts/prepare_eager_tsv.R -s $RUN -a SG -o eager_inputs/ -d .eva_credentials -done +done # Twist # Note: this find only checks runs starting from 2020. Silence stderr to avoid 'permission denied'. 
find /mnt/archgen/Autorun/Results/Human_RP/2* -name '*.bam' -mtime -1 2>/dev/null | cut -f 7 -d "/" | sort -u | while read RUN ; do echo "Processing RP data from run: ${RUN}" scripts/prepare_eager_tsv.R -s $RUN -a RP -o eager_inputs/ -d .eva_credentials -done +done + +# Twist + mtDNA +# Note: this find only checks runs starting from 2020. Silence stderr to avoid 'permission denied'. +find /mnt/archgen/Autorun/Results/Human_RM/2* -name '*.bam' -mtime -1 2>/dev/null | cut -f 7 -d "/" | sort -u | while read RUN ; do + echo "Processing RM data from run: ${RUN}" + scripts/prepare_eager_tsv.R -s $RUN -a RM -o eager_inputs/ -d .eva_credentials +done diff --git a/scripts/ethical_sample_scrub.sh b/scripts/ethical_sample_scrub.sh index 18769b4..68b4061 100755 --- a/scripts/ethical_sample_scrub.sh +++ b/scripts/ethical_sample_scrub.sh @@ -63,7 +63,7 @@ else ## If the individuals were flagged as sensitive AFTER processing started, both the inputs and outputs should be made inaccessible. for raw_iid in ${scrub_me[@]}; do - for analysis_type in "SG" "TF" "RP"; do + for analysis_type in "SG" "TF" "RP" "RM"; do ## EAGER_INPUTS site_id="${raw_iid:0:3}" eager_input_tsv="${root_input_dir}/${analysis_type}/${site_id}/${raw_iid}/${raw_iid}.tsv" diff --git a/scripts/prepare_eager_tsv.R b/scripts/prepare_eager_tsv.R index 7b96fa4..79c4214 100755 --- a/scripts/prepare_eager_tsv.R +++ b/scripts/prepare_eager_tsv.R @@ -20,7 +20,7 @@ require(stringr) ## Validate analysis type option input validate_analysis_type <- function(option, opt_str, value, parser) { - valid_entries <- c("TF", "SG", "RP") ## TODO comment: should this be embedded within the function? You would want to maybe update this over time no? + valid_entries <- c("TF", "SG", "RP", "RM") ## TODO comment: should this be embedded within the function? You would want to maybe update this over time no? 
ifelse(value %in% valid_entries, return(value), stop(call.=F, "\n[prepare_eager_tsv.R] error: Invalid analysis type: '", value, "'\nAccepted values: ", paste(valid_entries,collapse=", "),"\n\n")) } @@ -46,7 +46,7 @@ save_ind_tsv <- function(data, rename, output_dir, ...) { data %>% select(-individual.Full_Individual_Id) %>% readr::write_tsv(file=paste0(ind_dir,"/",ind_id,".tsv")) ## Output structure can be changed here. ## Print Autorun_eager version to file - AE_version <- "1.4.0" + AE_version <- "1.5.0" cat(AE_version, file=paste0(ind_dir,"/autorun_eager_version.txt"), fill=T, append = F) } @@ -57,6 +57,7 @@ autorun_names_from_analysis_type <- function(analysis_type) { analysis_type == "TF" ~ c( "HUMAN_1240K", "Human_1240k" ), analysis_type == "SG" ~ c( "HUMAN_SHOTGUN", "Human_Shotgun" ), analysis_type == "RP" ~ c( "HUMAN_RP", "Human_RP" ), + analysis_type == "RM" ~ c( "HUMAN_RM", "Human_RM" ), ## Future analyses can be added here to pull those bams for eager processsing. TRUE ~ NA_character_ ) @@ -75,7 +76,7 @@ parser <- add_option(parser, c("-s", "--sequencing_batch_id"), type = 'character parser <- add_option(parser, c("-a", "--analysis_type"), type = 'character', action = "callback", dest = "analysis_type", callback = validate_analysis_type, default=NA, - help = "The analysis type to compile the data from. Should be one of: 'SG', 'TF'.") + help = "The analysis type to compile the data from. Should be one of: 'SG', 'TF', 'RP', 'RM'.") parser <- add_option(parser, c("-r", "--rename"), type = 'logical', action = 'store_true', dest = 'rename', default=F, help = "Changes all dots (.) in the Library_ID field of the output to underscores (_). diff --git a/scripts/run_Eager.sh b/scripts/run_Eager.sh index dc6c241..c32530d 100755 --- a/scripts/run_Eager.sh +++ b/scripts/run_Eager.sh @@ -35,7 +35,7 @@ Yellow=$(tput sgr0)'\033[1;33m' ## Yellow normal face ## Since I'm running through all data every time, the runtime of the script will increase marginally over time. 
## Maybe create a list of eager inputs that are newer than the MQC reports and use that to loop over? -for analysis_type in "SG" "TF" "RP"; do +for analysis_type in "SG" "TF" "RP" "RM"; do # echo ${analysis_type} analysis_profiles="${nextflow_profiles},${analysis_type}" # echo "${root_input_dir}/${analysis_type}" diff --git a/scripts/update_poseidon_package.sh b/scripts/update_poseidon_package.sh index e346ab6..1d33ab5 100755 --- a/scripts/update_poseidon_package.sh +++ b/scripts/update_poseidon_package.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -VERSION="1.4.0" +VERSION="1.5.0" ## Colours for printing to terminal Yellow=$(tput sgr0)'\033[1;33m' ## Yellow normal face