Merge pull request #19 from MPI-EVA-Archaeogenetics/dev

Add ethical sample scrubbing. v1.3.0
MPI-EVA-Archaeogenetics · Aug 24, 2023 · 72e9e79 · 72e9e79
2 parents 99b3259 + d33cadd
commit 72e9e79
Show file tree

Hide file tree

Showing 10 changed files with 303 additions and 6 deletions.
diff --git a/.gitignore b/.gitignore
@@ -15,3 +15,5 @@ test_data/
 eager_inputs_old/
 eager_outputs_old/
 array_Logs/
+poseidon_packages/
+debug_tables/
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,20 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.3.0] - 12/07/2023
+
+### `Added`
+ - `scripts/ethical_sample_scrub.sh`: A script to remove eager input/outputs for samples that were marked as ethically sensitive after the pipelines picked them up.
+ - `scripts/cron_ethical_scrub.sh`: A cron-able script to run `ethical_sample_scrub.sh` daily.
+ - `scripts/clear_work_dirs.sh`: A bash script to `rm -r` the work directories of an individual ID for both `SG` and `TF` processing.
+ - `scripts/clear_results.sh`: A bash script that deletes the results for an individual while maintaining the nextflow process cache for them.
+
+### `Fixed`
+ - `scripts/cron_daily_prepare.sh`: Silenced permission errors due to ethical sample scrubbing.
+### `Dependencies`
+
+### `Deprecated`
+
 ## [1.2.0] - 21/03/2023
 
 ### `Added`

diff --git a/README.md b/README.md
@@ -146,3 +146,46 @@ Comparing the timestamp of the Autorun_eager genotypes and those in the poseidon
 5. Use `trident update` to bump the package version (`1.0.0` if the package is newly created), and create a Changelog.
 6. Validate the resulting package.
 7. If validation passes, publish the (updated) version of the package to the central repository in `poseidon_packages/` and remove any temporary files created.
+
+## ethical_sample_scrub.sh
+
+A shell script that scrubs the Autorun_eager input and output directories of all individuals in a specified list of sensitive sequencing IDs. This is used daily with the most up-to-date list of sensitive sequencing IDs to ensure that no results are available even if marking samples as sensitive was done late.
+
+```
+     usage: ethical_sample_scrub.sh [options] <sensitive_seqIds_list>
+
+This script pulls the Pandora individual IDs from the list of sensitive sequencing IDs, and
+    removes all Autorun_eager input and outputs from those individuals (if any).
+    This ensures that no results are available even if marking samples as sensitive was done late.
+
+Options:
+-h, --help		Print this text and exit.
+```
+
+## clear_work_dirs.sh
+
+A shell script that will clear the work directories of individuals in a specified individual ID list from both the SG and TF results directories.
+
+```
+     usage: clear_work_dirs.sh [options] <ind_id_list>
+
+This script clears the work directories of individuals in a specified individual ID list from both the SG and TF results directories.
+
+Options:
+-h, --help		Print this text and exit.
+```
+
+## clear_results.sh
+
+A shell script that clears the results directories of all individuals in a specified list while maintaining nextflow's caching of already-run processes. This is useful for refreshing the results directories of individuals when changes to the input might have changed the merging of libraries, thus making the directory structure inconsistent.
+
+```
+     usage: clear_results.sh [options] <ind_id_list>
+
+This script removes all output directory contents for the provided individuals, without clearing out caching, allowing for the results to be re-published.
+    This enables refreshing of result directories when changes to the input might have changes merging of libraries, thus making the directory structure inconsistent.
+
+Options:
+-h, --help		Print this text and exit.
+-a, --analysis_type		Set the analysis type. Options: TF, SG.
+```
diff --git a/scripts/clear_results.sh b/scripts/clear_results.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+
+## This script removes the results for an individual while maintaining the nextflow process cache for them.
+##    It is intended as a way to refresh the results directories of an individual. This can be useful either
+##    to remove older files after additional libraries appear and are therefore merged, or to remove results
+##    with misleading names in cases where Pandora entries get updated (e.g. protocol mixup leading to changes
+##    in strandedness for a library).
+
+## Helptext function
+function Helptext() {
+  echo -ne "\t usage: $0 [options] <ind_id_list>\n\n"
+  echo -ne "This script removes all output directory contents for the provided individuals, without clearing out caching, allowing for the results to be re-published.\n    This enables refreshing of result directories when changes to the input might have changes merging of libraries, thus making the directory structure inconsistent.\n\n"
+  echo -ne "Options:\n"
+  echo -ne "-h, --help\t\tPrint this text and exit.\n"
+  echo -ne "-a, --analysis_type\t\tSet the analysis type. Options: TF, SG.\n"
+}
+
+## Print messages to stderr, optionally with colours
+function errecho() {
+  local Normal
+  local Red
+  local Yellow
+  local colour
+
+  Normal=$(tput sgr0)
+  Red=$(tput sgr0)'\033[1;31m' ## Red normal face
+  Yellow=$(tput sgr0)'\033[1;33m' ## Yellow normal face
+
+  colour=''
+  if [[ ${1} == '-y' ]]; then
+    colour="${Yellow}"
+    shift 1
+  elif [[ ${1} == '-r' ]]; then
+    colour="${Red}"
+    shift 1
+  fi
+  echo -e ${colour}$*${Normal} 1>&2
+}
+
+## Parse CLI args.
+TEMP=$(getopt -q -o ha: --long analysis_type:,help -n 'clear_results.sh' -- "$@")
+eval set -- "$TEMP"
+
+## Default parameters
+ind_id_list_fn=''
+analysis_type=''
+
+## Read in CLI arguments
+while true ; do
+  case "$1" in
+    -h|--help) Helptext; exit 0 ;;
+    -a|--analysis_type) analysis_type="${2}"; shift 2;;
+    --) ind_id_list_fn="${2}"; break ;;
+    *) echo -e "invalid option provided: $1.\n"; Helptext; exit 1;;
+  esac
+done
+
+## Validate inputs. Every failed check must exit, since the rest of the script deletes data.
+if [[ -z ${ind_id_list_fn} ]]; then
+  errecho "No individual ID list provided.\n"
+  Helptext
+  exit 1
+fi
+
+if [[ ! -r ${ind_id_list_fn} ]]; then
+  errecho "Cannot read individual ID list: ${ind_id_list_fn}\n"
+  exit 1
+fi
+
+if [[ -z ${analysis_type} ]]; then
+  errecho "No --analysis_type was provided.\n"
+  Helptext
+  exit 1
+elif [[ ${analysis_type} != "SG" && ${analysis_type} != "TF" ]]; then
+  errecho "analysis_type must be SG or TF. You provided: ${analysis_type}\n"
+  Helptext
+  exit 1
+fi
+
+root_eager_dir='/mnt/archgen/Autorun_eager/eager_outputs' ## Directory should include subdirectories for each analysis type (TF/SG) and sub-subdirectories for each site and individual.
+
+## Read all whitespace-separated individual IDs into an array.
+##    'read -d ""' consumes the whole file and splits it on IFS whitespace without glob expansion.
+##    It returns non-zero at end of file, which is expected and ignored here.
+read -r -d '' -a input_iids < "${ind_id_list_fn}"
+
+## Remove all dirs except for 'work' and 'pipeline_info'.
+##    Both needed for caching.
+##    Also leave '1240k.imputed' and 'GTL_output' alone.
+for ind_id in "${input_iids[@]}"; do
+  site_id=${ind_id:0:3} ## Site id is the first three characters of the individual ID
+  ind_dir="${root_eager_dir}/${analysis_type}/${site_id}/${ind_id}"
+
+  if [[ ! -d ${ind_dir} ]]; then
+    errecho -y "No results directory found for ${ind_id}: ${ind_dir}"
+    continue
+  fi
+
+  ## Iterate over the glob directly instead of parsing the output of ls.
+  for entry in "${ind_dir}"/*; do
+    ## An unmatched glob is left as the literal pattern; skip it.
+    [[ -e ${entry} || -L ${entry} ]] || continue
+
+    ## Same word-match filter as before, applied to the full path: protected entries are kept.
+    if grep -qw -e 'work' -e '1240k.imputed' -e 'GTL_output' -e 'pipeline_info' <<< "${entry}"; then
+      continue
+    fi
+
+    errecho "Deleting results in: ${entry}"
+    rm -r -- "${entry}" ## Delete the specific result directory and all its contents
+  done
+done
diff --git a/scripts/clear_work_dirs.sh b/scripts/clear_work_dirs.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+
+## This script accepts a list of individual IDs and clears the nextflow work directories for both SG and TF data processing of each ID.
+
+## Helptext function
+function Helptext() {
+  echo -ne "\t usage: $0 [options] <ind_id_list>\n\n"
+  echo -ne "This script clears the work directories of individuals in a specified individual ID list from both the SG and TF results directories.\n\n"
+  echo -ne "Options:\n"
+  echo -ne "-h, --help\t\tPrint this text and exit.\n"
+}
+
+## Print messages to stderr
+function errecho() { echo -e $* 1>&2 ;}
+
+## Parse CLI args.
+TEMP=$(getopt -q -o h --long help -n 'clear_work_dirs.sh' -- "$@")
+eval set -- "$TEMP"
+
+ind_id_list_fn=''
+
+## Read in CLI arguments
+while true ; do
+  case "$1" in
+    -h|--help) Helptext; exit 0 ;;
+    --) ind_id_list_fn="${2}"; break ;;
+    *) echo -e "invalid option provided: $1.\n"; Helptext; exit 1;;
+  esac
+done
+
+## Validate inputs
+if [[ -z ${ind_id_list_fn} ]]; then
+  echo -e "No individual ID list provided.\n"
+  Helptext
+  exit 1
+fi
+
+if [[ ! -r ${ind_id_list_fn} ]]; then
+  errecho "Cannot read individual ID list: ${ind_id_list_fn}"
+  exit 1
+fi
+
+root_eager_dir='/mnt/archgen/Autorun_eager/eager_outputs' ## Directory should include subdirectories for each analysis type (TF/SG) and sub-subdirectories for each site and individual.
+
+## Read all whitespace-separated individual IDs into an array.
+##    'read -d ""' consumes the whole file and splits it on IFS whitespace without glob expansion.
+##    It returns non-zero at end of file, which is expected and ignored here.
+read -r -d '' -a input_iids < "${ind_id_list_fn}"
+
+for ind_id in "${input_iids[@]}"; do
+  site_id=${ind_id:0:3} ## Site id is the first three characters of the individual ID
+  errecho -ne "Clearing work directories for ${ind_id}..."
+  for analysis_type in SG TF; do
+    work_dir="${root_eager_dir}/${analysis_type}/${site_id}/${ind_id}/work"
+    if [[ -d ${work_dir} ]]; then
+      errecho -ne " ${analysis_type}..."
+      rm -rf -- "${work_dir}"
+    fi
+  done
+  errecho '' ## Terminate the progress line
+done
diff --git a/scripts/cron_daily_prepare.sh b/scripts/cron_daily_prepare.sh
@@ -7,15 +7,15 @@
 cd /mnt/archgen/Autorun_eager
 
 # 1240k
-# Note: this find only checks runs starting from 2020
-find /mnt/archgen/Autorun/Results/Human_1240k/2* -name '*.bam' -mtime -1 | cut -f 7 -d "/"| sort -u| while read RUN ; do
+# Note: this find only checks runs starting from 2020. Silence stderr to avoid 'permission denied'.
+find /mnt/archgen/Autorun/Results/Human_1240k/2* -name '*.bam' -mtime -1 2>/dev/null | cut -f 7 -d "/" | sort -u | while read RUN ; do
     echo "Processing TF data from run: ${RUN}"
     scripts/prepare_eager_tsv.R -s $RUN -a TF -o eager_inputs/ -d .eva_credentials
 done 
 
 # Shotgun
-# Note: this find only checks runs starting from 2020
-find /mnt/archgen/Autorun/Results/Human_Shotgun/2* -name '*.bam' -mtime -1 | cut -f 7 -d "/"| sort -u| while read RUN ; do
+# Note: this find only checks runs starting from 2020.  Silence stderr to avoid 'permission denied'.
+find /mnt/archgen/Autorun/Results/Human_Shotgun/2* -name '*.bam' -mtime -1 2>/dev/null | cut -f 7 -d "/" | sort -u | while read RUN ; do
     echo "Processing SG data from run: ${RUN}"
     scripts/prepare_eager_tsv.R -s $RUN -a SG -o eager_inputs/ -d .eva_credentials
 done 
diff --git a/scripts/cron_ethical_scrub.sh b/scripts/cron_ethical_scrub.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+## Use ethically_culturally_sensitive list to scrub any sensitive sample results
+
+## The scrub script below is called by relative path, so stop if the working directory cannot be entered.
+cd /mnt/archgen/Autorun_eager || exit 1
+
+list_fn="/mnt/archgen/Autorun/Pandora_Tables/Ethically_Sensitive.txt"
+
+scripts/ethical_sample_scrub.sh "${list_fn}"
diff --git a/scripts/ethical_sample_scrub.sh b/scripts/ethical_sample_scrub.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+
+## Helptext function
+function Helptext() {
+  echo -ne "\t usage: $0 [options] <sensitive_seqIds_list>\n\n"
+  echo -ne "This script pulls the Pandora individual IDs from the list of sensitive sequencing IDs, and\n    removes all Autorun_eager input and outputs from those individuals (if any).\n    This ensures that no results are available even if marking samples as sensitive was done late.\n\n"
+  echo -ne "Options:\n"
+  echo -ne "-h, --help\t\tPrint this text and exit.\n"
+}
+
+## Print messages to stderr, optionally with colours
+function errecho() {
+  local Normal
+  local Red
+  local Yellow
+  local colour
+
+  Normal=$(tput sgr0)
+  Red=$(tput sgr0)'\033[1;31m' ## Red normal face
+  Yellow=$(tput sgr0)'\033[1;33m' ## Yellow normal face
+
+  colour=''
+  if [[ ${1} == '-y' ]]; then
+    colour="${Yellow}"
+    shift 1
+  elif [[ ${1} == '-r' ]]; then
+    colour="${Red}"
+    shift 1
+  fi
+  echo -e ${colour}$*${Normal} 1>&2
+}
+
+## Parse CLI args.
+TEMP=`getopt -q -o h --long help -n 'ethical_sample_scrub.sh' -- "$@"`
+eval set -- "$TEMP"
+
+## Read in CLI arguments
+while true ; do
+  case "$1" in
+    -h|--help) Helptext; exit 0 ;;
+    --) sensitive_seq_id_list="${2}"; break ;;
+    *) echo -e "invalid option provided: $1.\n"; Helptext; exit 1;;
+  esac
+done
+
+## Hardcoded paths
+root_input_dir='/mnt/archgen/Autorun_eager/eager_inputs'   ## Directory should include subdirectories for each analysis type (TF/SG) and sub-subdirectories for each site and individual.
+root_output_dir='/mnt/archgen/Autorun_eager/eager_outputs' ## Directory should include subdirectories for each analysis type (TF/SG) and sub-subdirectories for each site and individual.
+
+
+if [[ ${sensitive_seq_id_list} = '' ]]; then
+    echo -e "No input file provided.\n"
+    Helptext
+    exit 1
+fi
+
+if [[ ! -f ${sensitive_seq_id_list} ]]; then
+  echo "File not found: ${sensitive_seq_id_list}"
+  exit 1
+else
+  ## Create list of unique individual IDs from the list of sensitive seq_ids
+  scrub_me=($(cut -d '.' -f 1 ${sensitive_seq_id_list} | sort -u ))
+
+  ## If the individuals were flagged as sensitive AFTER processing started, both the inputs and outputs should be made inaccessible.
+  for raw_iid in ${scrub_me[@]}; do
+    for analysis_type in "SG" "TF"; do
+      ## EAGER_INPUTS
+      site_id="${raw_iid:0:3}"
+      eager_input_tsv="${root_input_dir}/${analysis_type}/${site_id}/${raw_iid}/${raw_iid}.tsv"
+      ## If the eager inpput exists, hide the entire directory and make it inaccessible
+      if [[ -f ${eager_input_tsv} ]]; then
+        old_name=$(dirname ${eager_input_tsv})
+        new_name=$(dirname ${old_name})/.${raw_iid}
+        mv -v ${old_name} ${new_name} ## Hide the input directory
+        chmod 0700 ${new_name}        ## Restrict the directory contents
+      fi
+
+      ## EAGER_OUTPUTS
+      eager_output_dir="${root_output_dir}/${analysis_type}/${site_id}/${raw_iid}/"
+      if [[ -d ${eager_output_dir} ]]; then
+        new_outdir_name=$(dirname ${eager_output_dir})/.${raw_iid}
+        mv -v ${eager_output_dir} ${new_outdir_name} ## Hide the output directory
+        chmod 0700 ${new_outdir_name}                ## Restrict the directory contents
+      fi
+    done
+  done
+fi
+
diff --git a/scripts/prepare_eager_tsv.R b/scripts/prepare_eager_tsv.R
@@ -46,7 +46,7 @@ save_ind_tsv <- function(data, rename, output_dir, ...) {
   data %>% select(-individual.Full_Individual_Id) %>%  readr::write_tsv(file=paste0(ind_dir,"/",ind_id,".tsv")) ## Output structure can be changed here.
 
   ## Print Autorun_eager version to file
-  AE_version <- "1.2.0"
+  AE_version <- "1.3.0"
   cat(AE_version, file=paste0(ind_dir,"/autorun_eager_version.txt"), fill=T, append = F)
 }
 

diff --git a/scripts/update_poseidon_package.sh b/scripts/update_poseidon_package.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-VERSION="1.2.0"
+VERSION="1.3.0"
 
 ## Colours for printing to terminal
 Yellow=$(tput sgr0)'\033[1;33m' ## Yellow normal face