-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #19 from MPI-EVA-Archaeogenetics/dev
Add ethical sample scrubbing. v1.3.0
- Loading branch information
Showing
10 changed files
with
303 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,3 +15,5 @@ test_data/ | |
eager_inputs_old/ | ||
eager_outputs_old/ | ||
array_Logs/ | ||
poseidon_packages/ | ||
debug_tables/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
#!/usr/bin/env bash | ||
|
||
## This script removes the results for an individual while maintaining the nextflow process cache for them. | ||
## It is intended as a way to refresh the results directories of an individual. This can be useful either | ||
## to remove older files after additional libraries appear and are therefore merged, or to remove results | ||
## with misleading names in cases where Pandora entries get updated (e.g. protocol mixup leading to changes | ||
## in strandedness for a library). | ||
|
||
## Helptext function
## Prints the usage text for this script to stdout.
## The script path ($0) is passed as a printf argument rather than embedded in
## an escape-interpreting string, so backslashes or '%' in the path are printed
## literally.
function Helptext() {
  printf '\t usage: %s [options] <ind_id_list>\n\n' "$0"
  printf 'This script removes all output directory contents for the provided individuals, without clearing out caching, allowing for the results to be re-published.\n This enables refreshing of result directories when changes to the input might have changed the merging of libraries, thus making the directory structure inconsistent.\n\n'
  printf 'Options:\n'
  ## '--' stops printf from treating the leading '-h' / '-a' as its own options.
  printf -- '-h, --help\t\tPrint this text and exit.\n'
  printf -- '-a, --analysis_type\t\tSet the analysis type. Options: TF, SG.\n'
}
|
||
## Print messages to stderr, optionally with colours
## Arguments:
##   -y | -r  (optional, first argument only) print the message in yellow / red
##   $@       message words; joined with single spaces, backslash escapes are
##            interpreted (as with 'echo -e')
## Outputs: the message, followed by the terminal reset sequence and a newline,
##          on stderr. Nothing is written to stdout.
function errecho() {
  local Normal
  local Red
  local Yellow
  local colour

  ## tput fails when TERM is unset or unknown (e.g. when run from cron);
  ## in that case fall back to an empty reset sequence instead of emitting
  ## tput's own error on every call.
  Normal=$(tput sgr0 2>/dev/null) || Normal=''
  Red="${Normal}"'\033[1;31m' ## Red normal face
  Yellow="${Normal}"'\033[1;33m' ## Yellow normal face

  colour=''
  case "${1:-}" in
    -y) colour="${Yellow}"; shift 1 ;;
    -r) colour="${Red}"; shift 1 ;;
  esac

  ## Quoted expansion: the message is neither word-split (spacing is kept)
  ## nor glob-expanded ('*' stays a literal asterisk).
  printf '%b\n' "${colour}$*${Normal}" 1>&2
}
|
||
## Parse CLI args.
## getopt runs with -q, so its exit status is the only sign of an unknown or
## malformed option; stop here rather than silently ignoring it.
if ! TEMP=$(getopt -q -o ha: --long analysis_type:,help -n 'clear_results.sh' -- "$@"); then
  echo -e "invalid option provided.\n"
  Helptext
  exit 1
fi
eval set -- "$TEMP"

## Default parameters
ind_id_list_fn=''
analysis_type=''

## Read in CLI arguments
while true ; do
  case "$1" in
    -h|--help) Helptext; exit 0 ;;
    -a|--analysis_type) analysis_type="${2}"; shift 2;;
    --) ind_id_list_fn="${2}"; break ;; ## First positional argument is the ID list file
    *) echo -e "invalid option provided: $1.\n"; Helptext; exit 1;;
  esac
done

## Validate inputs
if [[ -z "${ind_id_list_fn}" ]]; then
  errecho "No individual ID list provided.\n"
  Helptext
  exit 1
fi

if [[ ! -r "${ind_id_list_fn}" || -d "${ind_id_list_fn}" ]]; then
  errecho "Cannot read individual ID list: ${ind_id_list_fn}"
  exit 1
fi

## Every failed check must exit: continuing with an empty or unknown analysis
## type would let the deletion loop below run against an unintended path.
if [[ -z "${analysis_type}" ]]; then
  errecho "No --analysis_type was provided.\n"
  Helptext
  exit 1
elif [[ "${analysis_type}" != "SG" && "${analysis_type}" != "TF" ]]; then
  errecho "analysis_type must be SG or TF. You provided: ${analysis_type}\n"
  Helptext
  exit 1
fi

root_eager_dir='/mnt/archgen/Autorun_eager/eager_outputs' ## Directory should include subdirectories for each analysis type (TF/SG) and sub-subdirectories for each site and individual.

## Read all individual IDs into an array.
## IDs are whitespace-separated (as before) but are no longer glob-expanded.
## read returns non-zero at end of file even though the array was filled.
input_iids=()
read -r -d '' -a input_iids < "${ind_id_list_fn}" || true

## Remove all dirs except for 'work' and 'pipeline_info'.
## Both needed for caching.
## Also leave '1240k.imputed' and 'GTL_output' alone.
for ind_id in "${input_iids[@]}"; do
  site_id=${ind_id:0:3} ## Site id is the first three characters of the individual ID
  ind_dir="${root_eager_dir}/${analysis_type}/${site_id}/${ind_id}"

  if [[ ! -d "${ind_dir}" ]]; then
    errecho -y "No results directory found for ${ind_id}: ${ind_dir}"
    continue
  fi

  for dir in "${ind_dir}"/*; do
    [[ -e "${dir}" ]] || continue ## Unmatched glob: the directory is empty
    ## Decide on the entry name only, never on the full path, and compare
    ## literally so '.' in '1240k.imputed' is not a wildcard.
    case "${dir##*/}" in
      work|pipeline_info|1240k.imputed|GTL_output) continue ;;
    esac
    errecho "Deleting results in: ${dir}"
    rm -r -- "${dir}" ## Delete the specific result directory and all its contents
  done
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
#!/usr/bin/env bash | ||
|
||
## This script accepts a list of individual IDs and clears the nextflow work directories for both SG and TF data processing of each ID. | ||
|
||
## Helptext function
## Prints the usage text for this script to stdout.
## The script path ($0) is passed as a printf argument rather than embedded in
## an escape-interpreting string, so backslashes or '%' in the path are printed
## literally.
function Helptext() {
  printf '\t usage: %s [options] <ind_id_list>\n\n' "$0"
  printf 'This script clears the work directories of individuals in a specified individual ID list from both the SG and TF results directories.\n\n'
  printf 'Options:\n'
  ## '--' stops printf from treating the leading '-h' as one of its own options.
  printf -- '-h, --help\t\tPrint this text and exit.\n'
}
|
||
## Print messages to stderr
## Arguments are handed to 'echo -e' unchanged, so leading echo options such as
## '-ne' keep working. Each argument is quoted, which preserves whitespace
## inside it (e.g. a leading space) and prevents glob expansion.
function errecho() {
  echo -e "$@" 1>&2
}
|
||
## Parse CLI args.
## getopt runs with -q, so its exit status is the only sign of an unknown or
## malformed option; stop here rather than silently ignoring it.
if ! TEMP=$(getopt -q -o h --long help -n 'clean_work_dirs.sh' -- "$@"); then
  echo -e "invalid option provided.\n"
  Helptext
  exit 1
fi
eval set -- "$TEMP"

ind_id_list_fn=''

## Read in CLI arguments
while true ; do
  case "$1" in
    -h|--help) Helptext; exit 0 ;;
    --) ind_id_list_fn="${2}"; break ;; ## First positional argument is the ID list file
    *) echo -e "invalid option provided: $1.\n"; Helptext; exit 1;;
  esac
done

if [[ -z "${ind_id_list_fn}" ]]; then
  echo -e "No individual ID list provided.\n"
  Helptext
  exit 1
fi

if [[ ! -r "${ind_id_list_fn}" || -d "${ind_id_list_fn}" ]]; then
  errecho "Cannot read individual ID list: ${ind_id_list_fn}"
  exit 1
fi

root_eager_dir='/mnt/archgen/Autorun_eager/eager_outputs' ## Directory should include subdirectories for each analysis type (TF/SG) and sub-subdirectories for each site and individual.

## Read all individual IDs into an array.
## IDs are whitespace-separated (as before) but are no longer glob-expanded.
## read returns non-zero at end of file even though the array was filled.
input_iids=()
read -r -d '' -a input_iids < "${ind_id_list_fn}" || true

for ind_id in "${input_iids[@]}"; do
  site_id=${ind_id:0:3} ## Site id is the first three characters of the individual ID
  errecho -ne "Clearing work directories for ${ind_id}..."
  for analysis_type in SG TF; do
    work_dir="${root_eager_dir}/${analysis_type}/${site_id}/${ind_id}/work"
    if [[ -d "${work_dir}" ]]; then
      errecho -ne " ${analysis_type}..."
      ## Quoted and '--'-terminated so an unusual ID can never be split into
      ## several paths or read as an rm option.
      rm -rf -- "${work_dir}"
    fi
  done
  errecho '' ## Terminate the progress line
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
#!/bin/bash | ||
|
||
## Use ethically_culturally_sensitive list to scrub any sensitive sample results | ||
|
||
## The scrub script is addressed by a relative path, so abort if the working
## directory cannot be entered instead of running from wherever we happen to be.
cd /mnt/archgen/Autorun_eager || exit 1

list_fn="/mnt/archgen/Autorun/Pandora_Tables/Ethically_Sensitive.txt"

scripts/ethical_sample_scrub.sh "${list_fn}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
#!/usr/bin/env bash | ||
|
||
## Helptext function
## Prints the usage text for this script to stdout.
## The script path ($0) is passed as a printf argument rather than embedded in
## an escape-interpreting string, so backslashes or '%' in the path are printed
## literally.
function Helptext() {
  printf '\t usage: %s [options] <sensitive_seqIds_list>\n\n' "$0"
  printf 'This script pulls the Pandora individual IDs from the list of sensitive sequencing IDs, and\n removes all Autorun_eager input and outputs from those individuals (if any).\n This ensures that no results are available even if marking samples as sensitive was done late.\n\n'
  printf 'Options:\n'
  ## '--' stops printf from treating the leading '-h' as one of its own options.
  printf -- '-h, --help\t\tPrint this text and exit.\n'
}
|
||
## Print messages to stderr, optionally with colours
## Arguments:
##   -y | -r  (optional, first argument only) print the message in yellow / red
##   $@       message words; joined with single spaces, backslash escapes are
##            interpreted (as with 'echo -e')
## Outputs: the message, followed by the terminal reset sequence and a newline,
##          on stderr. Nothing is written to stdout.
function errecho() {
  local Normal
  local Red
  local Yellow
  local colour

  ## tput fails when TERM is unset or unknown (e.g. when run from cron);
  ## in that case fall back to an empty reset sequence instead of emitting
  ## tput's own error on every call.
  Normal=$(tput sgr0 2>/dev/null) || Normal=''
  Red="${Normal}"'\033[1;31m' ## Red normal face
  Yellow="${Normal}"'\033[1;33m' ## Yellow normal face

  colour=''
  case "${1:-}" in
    -y) colour="${Yellow}"; shift 1 ;;
    -r) colour="${Red}"; shift 1 ;;
  esac

  ## Quoted expansion: the message is neither word-split (spacing is kept)
  ## nor glob-expanded ('*' stays a literal asterisk).
  printf '%b\n' "${colour}$*${Normal}" 1>&2
}
|
||
## Parse CLI args.
## getopt runs with -q, so its exit status is the only sign of an unknown or
## malformed option; stop here rather than silently ignoring it.
if ! TEMP=$(getopt -q -o h --long help -n 'ethical_sample_scrub.sh' -- "$@"); then
  echo -e "invalid option provided.\n"
  Helptext
  exit 1
fi
eval set -- "$TEMP"

## Start from a known-empty value so a variable inherited from the environment
## can never be mistaken for the input file.
sensitive_seq_id_list=''

## Read in CLI arguments
while true ; do
  case "$1" in
    -h|--help) Helptext; exit 0 ;;
    --) sensitive_seq_id_list="${2}"; break ;; ## First positional argument is the list of sensitive sequencing IDs
    *) echo -e "invalid option provided: $1.\n"; Helptext; exit 1;;
  esac
done

## Hardcoded paths
root_input_dir='/mnt/archgen/Autorun_eager/eager_inputs' ## Directory should include subdirectories for each analysis type (TF/SG) and sub-subdirectories for each site and individual.
root_output_dir='/mnt/archgen/Autorun_eager/eager_outputs' ## Directory should include subdirectories for each analysis type (TF/SG) and sub-subdirectories for each site and individual.

if [[ -z "${sensitive_seq_id_list}" ]]; then
  echo -e "No input file provided.\n"
  Helptext
  exit 1
fi

if [[ ! -f "${sensitive_seq_id_list}" ]]; then
  echo "File not found: ${sensitive_seq_id_list}"
  exit 1
fi

## Create list of unique individual IDs from the list of sensitive seq_ids.
## The individual ID is everything before the first '.' of each sequencing ID.
## IDs are whitespace-separated (as before) but are no longer glob-expanded.
## read returns non-zero at end of input even though the array was filled.
scrub_me=()
read -r -d '' -a scrub_me < <(cut -d '.' -f 1 -- "${sensitive_seq_id_list}" | sort -u) || true

## If the individuals were flagged as sensitive AFTER processing started, both the inputs and outputs should be made inaccessible.
for raw_iid in "${scrub_me[@]}"; do
  site_id="${raw_iid:0:3}" ## Site id is the first three characters of the individual ID
  for analysis_type in "SG" "TF"; do
    ## EAGER_INPUTS
    input_parent="${root_input_dir}/${analysis_type}/${site_id}"
    eager_input_tsv="${input_parent}/${raw_iid}/${raw_iid}.tsv"
    ## If the eager input exists, hide the entire directory and make it inaccessible
    if [[ -f "${eager_input_tsv}" ]]; then
      old_name="${input_parent}/${raw_iid}"
      new_name="${input_parent}/.${raw_iid}"
      mv -v -- "${old_name}" "${new_name}" ## Hide the input directory
      chmod 0700 -- "${new_name}" ## Restrict the directory contents
    fi

    ## EAGER_OUTPUTS
    output_parent="${root_output_dir}/${analysis_type}/${site_id}"
    eager_output_dir="${output_parent}/${raw_iid}"
    if [[ -d "${eager_output_dir}" ]]; then
      new_outdir_name="${output_parent}/.${raw_iid}"
      mv -v -- "${eager_output_dir}" "${new_outdir_name}" ## Hide the output directory
      chmod 0700 -- "${new_outdir_name}" ## Restrict the directory contents
    fi
  done
done
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters