Merge pull request #23 from MPI-EVA-Archaeogenetics/dev

v1.4.0

TCLamnidis authored Nov 4, 2024
2 parents 7099275 + ead76a5 · commit 7033abf

Showing 11 changed files with 177 additions and 72 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -16,4 +16,5 @@ eager_inputs_old/
eager_outputs_old/
array_Logs/
poseidon_packages/
-debug_tables/
+debug_tables/
+*.txt
19 changes: 19 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,25 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.5.0] - 30/09/2024

### `Added`

- Processing of RM data (Twist capture + mtDNA).
- `conf/Autorun.config`: Add RM profile for processing Twist+MT capture results. Identical to TF for now.
- `scripts/prepare_eager_tsv.R`: Add RM analysis type for Twist+MT capture results.
- `scripts/run_Eager.sh`: Run for RM TSVs.
- `scripts/cron_daily_prepare.sh`: Create RM analysis TSVs daily.
- `scripts/ethical_sample_scrub.sh`: Add RM analysis type for ethical sample scrubbing.
- `scripts/clear_work_dirs.sh`: Add RM analysis type for work directory clearing.
- `scripts/clear_results.sh`: Add RM analysis type for results directory clearing.

### `Fixed`

### `Dependencies`

### `Deprecated`
## [1.4.0] - 12/07/2023

### `Added`
118 changes: 72 additions & 46 deletions README.md
@@ -4,25 +4,27 @@ Automated nf-core/eager processing of Autorun output bams.

## Quickstart

-- Run `prepare_eager_tsv.R` for human SG or TF data for a given sequencing batch:
+- Run `prepare_eager_tsv.R` for human SG, TF, RP, or RM data for a given sequencing batch:

```bash
prepare_eager_tsv.R -s <batch_Id> -a SG -o eager_inputs/ -d .eva_credentials
prepare_eager_tsv.R -s <batch_Id> -a TF -o eager_inputs/ -d .eva_credentials
+prepare_eager_tsv.R -s <batch_Id> -a RP -o eager_inputs/ -d .eva_credentials
+prepare_eager_tsv.R -s <batch_Id> -a RM -o eager_inputs/ -d .eva_credentials
```

- Run eager with the following script, which then runs on the generated TSV files:

```bash
-run_Eager.sh
+run_Eager.sh -a
```

⚠️ For some library preparation protocols and external libraries, UDG treatment cannot be reliably inferred, and errors will be thrown.
In such cases, an eager input TSV will still be created, but UDG treatment for affected libraries will be set to 'Unknown' and needs to be manually edited.
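A quick way to find the TSVs that still need manual attention might be to search for the placeholder value (assuming the literal string `Unknown` appears as a tab-delimited field in the UDG treatment column, as described above):

```bash
## List eager input TSVs containing an unresolved UDG treatment value.
## Assumes 'Unknown' is written verbatim by prepare_eager_tsv.R.
grep -rl --include='*.tsv' $'\tUnknown\t' eager_inputs/
```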

## Autorun.config

-Contains the `autorun`, `SG` and `TF` profiles.
+Contains the `autorun`, `local_paths`, `SG`, `TF`, `RP`, and `RM` profiles.

### autorun

@@ -43,43 +45,51 @@ The standardised parameters for processing human shotgun data.

The standardised parameters for processing human 1240k capture data.

+### RP
+
+The standardised parameters for processing human Twist capture data.
+
+### RM
+
+The standardised parameters for processing human Twist+MT capture data.
+
## prepare_eager_tsv.R

An R script that, when given a sequencing batch ID, an Autorun analysis type, and PANDORA credentials, will create/update eager input TSV files for further processing.

```bash
-Usage: ./prepare_eager_tsv.R [options] .credentials
-
-Options:
-    -h, --help
-        Show this help message and exit
-
-    -s SEQUENCING_BATCH_ID, --sequencing_batch_id=SEQUENCING_BATCH_ID
-        The Pandora sequencing batch ID to update eager input for. A TSV file will be prepared
-        for each individual in this run, containing all relevant processed BAM files
-        from the individual
-
-    -a ANALYSIS_TYPE, --analysis_type=ANALYSIS_TYPE
-        The analysis type to compile the data from. Should be one of: 'SG', 'TF'.
-
-    -r, --rename
-        Changes all dots (.) in the Library_ID field of the output to underscores (_).
-        Some tools used in nf-core/eager will strip everything after the first dot (.)
-        from the name of the input file, which can cause naming conflicts in rare cases.
-
-    -w WHITELIST, --whitelist=WHITELIST
-        An optional file that includes the IDs of whitelisted individuals,
-        one per line. Only the TSVs for these individuals will be updated.
-
-    -o OUTDIR/, --outDir=OUTDIR/
-        The desired output directory. Within this directory, one subdirectory will be
-        created per analysis type, within that one subdirectory per individual ID,
-        and one TSV within each of these directories.
-
-    -d, --debug_output
-        When provided, the entire result table for the run will be saved as '<seq_batch_ID>.results.txt'.
-        Helpful to check all the output data in one place.
+Usage: ./scripts/prepare_eager_tsv.R [options] .credentials
+
+Options:
+    -h, --help
+        Show this help message and exit
+    -s SEQUENCING_BATCH_ID, --sequencing_batch_id=SEQUENCING_BATCH_ID
+        The Pandora sequencing batch ID to update eager input for. A TSV file will be prepared
+        for each individual in this run, containing all relevant processed BAM files
+        from the individual
+    -a ANALYSIS_TYPE, --analysis_type=ANALYSIS_TYPE
+        The analysis type to compile the data from. Should be one of: 'SG', 'TF', 'RP', 'RM'.
+    -r, --rename
+        Changes all dots (.) in the Library_ID field of the output to underscores (_).
+        Some tools used in nf-core/eager will strip everything after the first dot (.)
+        from the name of the input file, which can cause naming conflicts in rare cases.
+    -w WHITELIST, --whitelist=WHITELIST
+        An optional file that includes the IDs of whitelisted individuals,
+        one per line. Only the TSVs for these individuals will be updated.
+    -o OUTDIR, --outDir=OUTDIR
+        The desired output directory. Within this directory, one subdirectory will be
+        created per analysis type, within that one subdirectory per individual ID,
+        and one TSV within each of these directories.
+    -d, --debug_output
+        When provided, the entire result table for the run will be saved as '<seq_batch_ID>.results.txt'.
+        Helpful to check all the output data in one place.
+
+Note: a valid sidora .credentials file is required. Contact the Pandora/Sidora team for details.
```
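For instance, a whitelist-restricted refresh of RM TSVs with dot-renaming and debug output enabled might look like this (`<batch_Id>` and `whitelist.txt` are placeholders):

```bash
## Hypothetical invocation combining the documented flags above.
./scripts/prepare_eager_tsv.R -s <batch_Id> -a RM -r -w whitelist.txt -o eager_inputs/ -d .eva_credentials
```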
@@ -88,13 +98,21 @@ The eager input TSVs will be created in the following directory structure, given
```text
eager_inputs
├── SG
-│   └──IND
-│       ├── IND001
-│       └── IND002
-└── TF
-    └──IND
-        ├── IND001
-        └── IND002
+│   └──ABC
+│       ├── ABC001
+│       └── ABC002
+├── TF
+│   └──ABC
+│       ├── ABC001
+│       └── ABC002
+├── RP
+│   └──ABC
+│       ├── ABC001
+│       └── ABC002
+└── RM
+    └──ABC
+        ├── ABC001
+        └── ABC002
```
Alongside each created TSV is a file named `autorun_eager_version.txt`, which states the version of Autorun_eager used.
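For example, one could confirm which version produced a given TSV (path hypothetical, following the tree above):

```bash
## Check the recorded Autorun_eager version alongside a generated TSV.
cat eager_inputs/SG/ABC/ABC001/autorun_eager_version.txt
```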
@@ -112,13 +130,21 @@ The outputs are saved with the same directory structure as the inputs, but in a
```text
eager_outputs
├── SG
-│   └──IND
-│       ├── IND001
-│       └── IND002
-└── TF
-    └──IND
-        ├── IND001
-        └── IND002
+│   └──ABC
+│       ├── ABC001
+│       └── ABC002
+├── TF
+│   └──ABC
+│       ├── ABC001
+│       └── ABC002
+├── RP
+│   └──ABC
+│       ├── ABC001
+│       └── ABC002
+└── RM
+    └──ABC
+        ├── ABC001
+        └── ABC002
```
This script recognises the `-a/--array` option. When it is provided, instead of running eager jobs in sequence, a temporary file is created, named `$(date +'%y%m%d_%H%M')_Autorun_eager_queue.txt`, which includes the command line of each eager job to be run, one per line. An "Autorun_eager spawner" (`AE_spawner`) array job is then submitted using `qsub`, which uses a secondary script named `scripts/submit_as_array.sh` to submit the command in each line of the temporary file as a separate task. In this manner, 10 eager runs can be executed in parallel. Logs for these jobs will then be added to a directory named `array_Logs/<temp_file_name>/`.
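The queue-then-spawn pattern might be sketched as follows; this is an illustrative reconstruction, not the actual contents of `run_Eager.sh` or `scripts/submit_as_array.sh`, and the `qsub` flags and the `eager_cmds` array are assumptions:

```bash
## Illustrative sketch of array mode (assumed flags; eager_cmds is hypothetical).
queue_fn="$(date +'%y%m%d_%H%M')_Autorun_eager_queue.txt"
printf '%s\n' "${eager_cmds[@]}" > "${queue_fn}"   ## one eager command per line

## Submit one array task per line of the queue file, capped at 10 concurrent tasks.
n_tasks=$(wc -l < "${queue_fn}")
qsub -N AE_spawner -t 1-"${n_tasks}" -tc 10 scripts/submit_as_array.sh "${queue_fn}"

## Inside the task script, each task would then execute its own line, e.g.:
##   eval "$(sed -n "${SGE_TASK_ID}p" "$1")"
```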
73 changes: 62 additions & 11 deletions conf/Autorun.config
@@ -26,7 +26,7 @@ profiles {
fasta_index = '/mnt/archgen/Reference_Genomes/Human/hs37d5/hs37d5.fa.fai'
bwa_index = '/mnt/archgen/Reference_Genomes/Human/hs37d5/'
seq_dict = '/mnt/archgen/Reference_Genomes/Human/hs37d5/hs37d5.dict'

// Qualimap bedfile for on-target coverage calculation
snpcapture_bed = '/mnt/archgen/Reference_Genomes/Human/hs37d5/SNPCapBEDs/1240K.pos.list_hs37d5.0based.bed'

Expand All @@ -50,14 +50,14 @@ profiles {
bam_mapping_quality_threshold = 0 // Keep all mapped reads
bam_unmapped_type = 'bam' // Keep unmapped reads as a separate BAM file for possible future pathogen screening.
bam_filter_minreadlength = 30 // Do we need to add length filtering here at all? Does Kay's pre-processing do this?

// mtDNA to nuclear ratio
run_mtnucratio = true
mtnucratio_header = "MT"

// Ignore SNP capture bed for coverage calculations in non TF data.
snpcapture_bed = null

// Bam Trimming
// ssDNA libraries are left untrimmed (pileupcaller deals with damage in those)
// dsDNA half-udg are clipped 2bp on either side, while non-UDG are clipped 7bp
@@ -89,7 +89,7 @@
// Nuclear contamination
run_nuclear_contamination = true
contamination_chrom_name = 'X'

//1240k Coverage/Depth calculation (for poseidonisation)
run_bedtools_coverage = true
}
@@ -104,11 +104,11 @@
bam_mapping_quality_threshold = 0 // Keep all mapped reads
bam_unmapped_type = 'bam' // Keep unmapped reads as a separate BAM file for possible future pathogen screening.
bam_filter_minreadlength = 30 // Do we need to add length filtering here at all? Does Kay's pre-processing do this?

// mtDNA to nuclear ratio
run_mtnucratio = true
mtnucratio_header = "MT"

// Bam Trimming
// ssDNA libraries are left untrimmed (pileupcaller deals with damage in those)
// dsDNA half-udg are clipped 2bp on either side, while non-UDG are clipped 7bp
@@ -140,7 +140,7 @@
// Nuclear contamination
run_nuclear_contamination = true
contamination_chrom_name = 'X'

//1240k Coverage/Depth calculation (for poseidonisation)
run_bedtools_coverage = true
}
@@ -155,11 +155,11 @@
bam_mapping_quality_threshold = 0 // Keep all mapped reads
bam_unmapped_type = 'bam' // Keep unmapped reads as a separate BAM file for possible future pathogen screening.
bam_filter_minreadlength = 30 // Do we need to add length filtering here at all? Does Kay's pre-processing do this?

// mtDNA to nuclear ratio
run_mtnucratio = true
mtnucratio_header = "MT"

// Bam Trimming
// ssDNA libraries are left untrimmed (pileupcaller deals with damage in those)
// dsDNA half-udg are clipped 2bp on either side, while non-UDG are clipped 7bp
@@ -191,7 +191,58 @@
// Nuclear contamination
run_nuclear_contamination = true
contamination_chrom_name = 'X'


//1240k Coverage/Depth calculation (for poseidonisation)
run_bedtools_coverage = true
}
}

// Profile with parameters for runs using the Human_RM bams as input.
// Currently identical to TF profile. Just keeps the RM data separate for comparison.
RM {
params{
// BAM filtering
run_bam_filtering = true // Filter out unmapped reads, so barplots in MultiQC are not completely overtaken by unmapped reads.
bam_mapping_quality_threshold = 0 // Keep all mapped reads
bam_unmapped_type = 'bam' // Keep unmapped reads as a separate BAM file for possible future pathogen screening.
bam_filter_minreadlength = 30 // Do we need to add length filtering here at all? Does Kay's pre-processing do this?

// mtDNA to nuclear ratio
run_mtnucratio = true
mtnucratio_header = "MT"

// Bam Trimming
// ssDNA libraries are left untrimmed (pileupcaller deals with damage in those)
// dsDNA half-udg are clipped 2bp on either side, while non-UDG are clipped 7bp
run_trim_bam = true
bamutils_clip_single_stranded_half_udg_left = 0 // Set to 0 so ssDNA do not get trimmed.
bamutils_clip_single_stranded_half_udg_right = 0 // Set to 0 so ssDNA do not get trimmed.
bamutils_clip_single_stranded_none_udg_left = 0 // Set to 0 so ssDNA do not get trimmed.
bamutils_clip_single_stranded_none_udg_right = 0 // Set to 0 so ssDNA do not get trimmed.
bamutils_clip_double_stranded_half_udg_left = 2 // Trim 2 bp of either side for half-UDG libraries.
bamutils_clip_double_stranded_half_udg_right = 2 // Trim 2 bp of either side for half-UDG libraries.
// Usually for dsDNA non-UDG libraries this is between 5 and 10. I have set it to 7 arbitrarily since that was a good cutoff in my own projects so far.
bamutils_clip_double_stranded_none_udg_left = 7 // Trim 7 bp of either side for non-UDG libraries.
bamutils_clip_double_stranded_none_udg_right = 7 // Trim 7 bp of either side for non-UDG libraries.

// Damage Calculation
damage_calculation_tool = 'mapdamage'
mapdamage_downsample = 100000 // Use 100k reads for damage calculation to lower runtime.

// Genotyping
genotyping_source = 'trimmed' // Use trimmed bams for genotyping
run_genotyping = true
genotyping_tool = 'pileupcaller'
pileupcaller_min_map_quality = 25 // To allow for reads aligning with a mismatch, and reduce reference bias in genotypes.
pileupcaller_min_base_quality = 30

//Sex determination
run_sexdeterrmine = true

// Nuclear contamination
run_nuclear_contamination = true
contamination_chrom_name = 'X'

//1240k Coverage/Depth calculation (for poseidonisation)
run_bedtools_coverage = true
}
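Taken together, each profile pairs the standardised parameters above with one analysis type. A launch of a single individual might conceptually look like the following; the real command is constructed inside `scripts/run_Eager.sh`, so the flags and paths here are assumptions:

```bash
## Sketch only: combine the autorun base profile with one analysis-type profile.
nextflow run nf-core/eager \
  -c conf/Autorun.config \
  -profile autorun,RM \
  --input eager_inputs/RM/ABC/ABC001/ABC001.tsv \
  --outdir eager_outputs/RM/ABC/ABC001
```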
6 changes: 3 additions & 3 deletions scripts/clear_results.sh
@@ -12,7 +12,7 @@ function Helptext() {
echo -ne "This script removes all output directory contents for the provided individuals, without clearing out caching, allowing for the results to be re-published.\n This enables refreshing of result directories when changes to the input might have changes merging of libraries, thus making the directory structure inconsistent.\n\n"
echo -ne "Options:\n"
echo -ne "-h, --help\t\tPrint this text and exit.\n"
echo -ne "-a, --analysis_type\t\tSet the analysis type. Options: TF, SG.\n"
echo -ne "-a, --analysis_type\t\tSet the analysis type. Options: TF, SG, RP, RM.\n"
}

## Print messages to stderr, optionally with colours
@@ -65,8 +65,8 @@ fi
if [[ ${analysis_type} == '' ]]; then
errecho "No --analysis_type was provided.\n"
Helptext
elif [[ ${analysis_type} != "SG" && ${analysis_type} != "TF" && ${analysis_type} != "RP" ]]; then
errecho "analysis_type must be SG, TF, or RP. You provided: ${analysis_type}\n"
elif [[ ${analysis_type} != "SG" && ${analysis_type} != "TF" && ${analysis_type} != "RP" && ${analysis_type} != "RM" ]]; then
errecho "analysis_type must be SG, TF, RP, or RM. You provided: ${analysis_type}\n"
Helptext
fi
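A hypothetical invocation of the updated script, assuming it takes a file of individual IDs like its sibling scripts (check `-h` for the authoritative usage):

```bash
## Sketch only: scrub RM results for the listed individuals so they can be re-published.
scripts/clear_results.sh -a RM ind_id_list.txt
```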

2 changes: 1 addition & 1 deletion scripts/clear_work_dirs.sh
@@ -42,7 +42,7 @@ input_iids=($(cat ${ind_id_list_fn}))
for ind_id in ${input_iids[@]}; do
site_id=${ind_id:0:3} ## Site id is the first three characters of the individual ID
errecho -ne "Clearing work directories for ${ind_id}..."
for analysis_type in "SG" "TF" "RP"; do
for analysis_type in "SG" "TF" "RP" "RM"; do
if [[ -d ${root_eager_dir}/${analysis_type}/${site_id}/${ind_id}/work ]]; then
errecho -ne " ${analysis_type}..."
# ls -d ${root_eager_dir}/${analysis_type}/${site_id}/${ind_id}/work
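As the loop shows, the site ID is derived from the individual ID by substring expansion; a minimal illustration:

```bash
## The site-ID convention used above; root_eager_dir is hypothetical here.
root_eager_dir="eager_outputs"
ind_id="ABC001"
site_id=${ind_id:0:3}   ## first three characters -> "ABC"
echo "${root_eager_dir}/SG/${site_id}/${ind_id}/work"
```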