From b88d17f240b46ae6cce5c5a7c8e0b013e8b89520 Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Thu, 10 Oct 2024 19:04:06 +0100 Subject: [PATCH 01/24] changes on tag Here are some changes and benefits for data monitoring: 1. Tag each process using not only id but also chr 2. omit the red check, which is not being used. --- bin/basic_qc_nf.py | 62 ------------------------- config/basic.config | 7 ++- config/gwascatalog.config | 7 +++ config/test.config | 44 +++++++++++++++++- modules/local/ftp_copy.nf | 7 +-- modules/local/generate_strand_counts.nf | 2 +- modules/local/harmonization.nf | 4 +- modules/local/ten_percent_counts.nf | 2 +- subworkflows/local/move_files.nf | 3 +- 9 files changed, 65 insertions(+), 73 deletions(-) diff --git a/bin/basic_qc_nf.py b/bin/basic_qc_nf.py index 413ab931..4a05294e 100755 --- a/bin/basic_qc_nf.py +++ b/bin/basic_qc_nf.py @@ -30,31 +30,6 @@ # - if chr and bp not ints: remove row # 5) set chr 'x' and 'y' to 23 and 24 - - -class sqlClient(): - def __init__(self, database): - self.database = database - self.conn = self.create_conn() - self.cur = self.conn.cursor() - - def create_conn(self): - try: - conn = sqlite3.connect(self.database) - conn.row_factory = sqlite3.Row - return conn - except NameError as e: - print(e) - return None - - def get_synonyms(self, rsid): - data = [] - for row in self.cur.execute("select name from variation_synonym where variation_id in (select variation_id from variation_synonym where name =?)", (rsid,)): - data.append(row[0]) - return data - - - hm_header_transformations = { # variant id @@ -150,40 +125,6 @@ def drop_last_element_from_filename(filename): filename_parts = filename.split('-') return '-'.join(filename_parts[:-1]) - -""" - def resolve_invalid_rsids(row, header, ensembl_client=None, sql_client=None): - hm_rsid_idx = header.index('hm_rsid') - snp_idx = header.index(RSID) - # if possible, set variant_id to harmonised rsid - if row[hm_rsid_idx].startswith('rs'): - # check that if rsID already present is not synonym of that found in vcf - if row[snp_idx].startswith('rs') and row[snp_idx] != row[hm_rsid_idx]: - synonyms = [] - if ensembl_client: - rs_info = ensembl_client.get_rsid(row[snp_idx]) - if rs_info != "NA": - try: - synonyms = rs_info["synonyms"] - synonyms.append(rs_info["name"]) - except TypeError: - row[snp_idx] = 'NA' - elif sql_client: - synonyms = sql_client.get_synonyms(row[snp_idx]) - print(synonyms) - if row[hm_rsid_idx] in synonyms: - row[snp_idx] = row[hm_rsid_idx] - else: - row[snp_idx] = 'NA' - else: - row[snp_idx] = row[hm_rsid_idx] - # if variant_id is doesn't begin 'rs' - if not row[snp_idx].startswith('rs'): - row[snp_idx] = 'NA' - return row -""" - - def get_csv_reader(csv_file): dialect = csv.Sniffer().sniff(csv_file.readline()) csv_file.seek(0) @@ -237,9 +178,6 @@ def main(): else: # First try to replace an invalid variant_id with the hm_rsid # Checks for blanks, integers and floats: - #sql_client = sqlClient(db) if db else None - #ensembl_client = EnsemblRestClient() if not db else None - #row = resolve_invalid_rsids(row, header, ensembl_client, sql_client) row = blanks_to_NA(row) row = map_chr_values_to_numbers(row, header) unharmonisable = remove_row_if_unharmonisable(row, header) diff --git a/config/basic.config b/config/basic.config index 4f53dbd6..659e6e29 100644 --- a/config/basic.config +++ b/config/basic.config @@ -5,7 +5,7 @@ process { time = { 1.h * task.attempt } errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'ignore' } - maxRetries = 3 + maxRetries = 5 maxErrors = '-1' withName:map_to_build { @@ -18,6 +18,8 @@ process { } withName:ten_percent_counts { + memory = { 10.GB * task.attempt } + time = { 5.h * task.attempt } publishDir =[ path:{"${launchDir}/$GCST/ten_sc"}, mode: 'copy' @@ -64,6 +66,7 @@ process { } withName:qc { + memory = { 10.GB * task.attempt } time = { 5.h * task.attempt } publishDir =[ path:{"${launchDir}/$GCST/final"}, @@ -72,6 +75,8 @@ process { } withName:harmonization_log { + memory = { 10.GB * task.attempt } + time = { 3.h * task.attempt } publishDir =[ path:{"${launchDir}/$GCST/final"}, mode: 'copy' diff --git a/config/gwascatalog.config b/config/gwascatalog.config index 3089417a..b0faa886 100644 --- a/config/gwascatalog.config +++ b/config/gwascatalog.config @@ -10,8 +10,13 @@ params { } process { + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'ignore' } + maxRetries = 5 + maxErrors = '-1' withName:failed_copy { + memory = { 5.GB * task.attempt } + time = { 1.h * task.attempt } publishDir =[ path:{"${params.failed}"}, mode: 'move' @@ -19,6 +24,8 @@ process { } withName:ftp_copy { + memory = { 5.GB * task.attempt } + time = { 1.h * task.attempt } queue = 'datamover' publishDir =[ path:{"${params.ftp}"}, diff --git a/config/test.config b/config/test.config index 4d41af57..1150619d 100644 --- a/config/test.config +++ b/config/test.config @@ -8,6 +8,7 @@ params { } process{ + withName:map_to_build { memory = { 3.GB * task.attempt } time = { 3.h * task.attempt } @@ -16,7 +17,23 @@ process{ mode: 'copy' ] } - + + withName:ten_percent_counts { + memory = { 3.GB * task.attempt } + time = { 3.h * task.attempt } + publishDir =[ + path:{"${launchDir}/$GCST/ten_sc"}, + mode: 'copy' + ] + } + + withName:ten_percent_counts_sum { + publishDir =[ + path:{"${launchDir}/$GCST"}, + mode: 'copy' + ] + } + withName:generate_strand_counts { memory = { 3.GB * task.attempt } time = { 3.h * task.attempt } @@ -26,6 +43,13 @@ process{ ] } + withName:summarise_strand_counts { + publishDir =[ + path:{"${launchDir}/$GCST"}, + mode: 'copy' + ] + } + withName:harmonization { memory = { 3.GB * task.attempt } time = { 3.h * task.attempt } @@ -34,4 +58,22 @@ process{ mode: 'copy' ] } + + withName:qc { + memory = { 3.GB * task.attempt } + time = { 1.h * task.attempt } + publishDir =[ + path:{"${launchDir}/$GCST/final"}, + mode: 'copy' + ] + } + + withName:harmonization_log { + memory = { 3.GB * task.attempt } + time = { 1.h * task.attempt } + publishDir =[ + path:{"${launchDir}/$GCST/final"}, + mode: 'copy' + ] + } } diff --git a/modules/local/ftp_copy.nf b/modules/local/ftp_copy.nf index 98c0fb9d..f4c028ae 100644 --- a/modules/local/ftp_copy.nf +++ b/modules/local/ftp_copy.nf @@ -1,11 +1,12 @@ process ftp_copy{ + tag "$GCST" //conda (params.enable_conda ? "$projectDir/environments/conda_environment.yml" : null) //def dockerimg = "ebispot/gwas-sumstats-harmoniser:latest" //container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'docker://ebispot/gwas-sumstats-harmoniser:latest' : dockerimg }" input: - tuple val(GCST), path(raw_yaml), path(tsv), path(qc_tsv), path (log), path (yaml), val(status) + tuple val(GCST), path(raw_yaml), path(tsv), path(qc_tsv), path(running_log), path(yaml), val(status) output: tuple val(GCST), val(status), env(copy), emit: done @@ -30,8 +31,8 @@ process ftp_copy{ md5_h_tbi=\$(md5sum<${launchDir}/$GCST/final/${GCST}.h.tsv.gz.tbi | awk '{print \$1}') cp ${launchDir}/$GCST/final/${GCST}.h.tsv.gz.tbi \$path - cp ${launchDir}/$GCST/final/${GCST}.running.log \$path - cp ${launchDir}/$GCST/final/${GCST}.h.tsv.gz-meta.yaml \$path + cp $running_log \$path + cp $yaml \$path if [ \$md5_h_tsv==\$md5_h_tsv_copied ] then diff --git a/modules/local/generate_strand_counts.nf b/modules/local/generate_strand_counts.nf index 81805b0b..8197232c 100644 --- a/modules/local/generate_strand_counts.nf +++ b/modules/local/generate_strand_counts.nf @@ -1,5 +1,5 @@ process generate_strand_counts { - tag "$GCST" + tag "${GCST}_${chrom}" conda (params.enable_conda ? "${task.ext.conda}" : null) diff --git a/modules/local/harmonization.nf b/modules/local/harmonization.nf index 798c2a6a..3f4d83d9 100644 --- a/modules/local/harmonization.nf +++ b/modules/local/harmonization.nf @@ -1,5 +1,5 @@ process harmonization { - tag "$GCST" + tag "${GCST}_${chrom}" conda (params.enable_conda ? "${task.ext.conda}" : null) @@ -7,8 +7,6 @@ process harmonization { !task.ext.singularity_pull_docker_container ? "${task.ext.singularity}${task.ext.singularity_version}" : "${task.ext.docker}${task.ext.docker_version}" }" - - tag "$GCST" input: tuple val(GCST), val(palin_mode), val(status), val(chrom), path(merged), path(yaml), path(ref) diff --git a/modules/local/ten_percent_counts.nf b/modules/local/ten_percent_counts.nf index 5f1dbbd4..cdb1fe68 100644 --- a/modules/local/ten_percent_counts.nf +++ b/modules/local/ten_percent_counts.nf @@ -1,5 +1,5 @@ process ten_percent_counts { - tag "$GCST" + tag "${GCST}_${chrom}" conda (params.enable_conda ? "${task.ext.conda}" : null) diff --git a/subworkflows/local/move_files.nf b/subworkflows/local/move_files.nf index 09674602..7cae28cc 100644 --- a/subworkflows/local/move_files.nf +++ b/subworkflows/local/move_files.nf @@ -11,6 +11,7 @@ workflow move_files{ failed:it.contains("FAILED_HARMONIZATION")} //success_harmonized_file move to FTP //failed harmonized file move to Failed folder + //har_result_ch.success.view(): [GCST1, yaml,tsv,qc.tsv,running.log,h.tsv.gz-meta.yaml, SUCCESS_HARMONIZATION] ftp_copy(har_result_ch.success) failed_copy(har_result_ch.failed) //return channlel: [GCST,SUCCESS_HARMONIZATION,copied],[GCST,SUCCESS_HARMONIZATION,failed_copied],[GCST,SUCCESS_HARMONIZATION,failed_copy] @@ -18,4 +19,4 @@ workflow move_files{ emit: tmp=final_ch -} \ No newline at end of file +} From 668c9982e409a0c1ef4da43b2c1ca81d61319a15 Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Tue, 29 Oct 2024 14:41:38 +0000 Subject: [PATCH 02/24] updates to gwas-ssf 1. include z-score in the harmonisation 2. read the version of the pipeline from the parameter 3. update the gwas-sumstat-tools 1.0.22 4. add new test file with random name --- bin/common_constants.py | 21 ++++++++++++--------- bin/lib/SumStatRecord.py | 13 +++++++++++-- bin/log_script.sh | 14 ++++++++------ bin/main_pysam.py | 11 ++++++----- environments/conda_environment.yml | 2 +- modules/local/harmonization_log.nf | 3 ++- test_data/random_name.tsv | 5 +++++ test_data/rsID.sql | Bin 16384 -> 0 bytes workflows/gwascatalogharm.nf | 9 ++++++--- workflows/gwascatalogharm_gwascatalog.nf | 6 ++++++ 10 files changed, 57 insertions(+), 27 deletions(-) create mode 100644 test_data/random_name.tsv delete mode 100644 test_data/rsID.sql diff --git a/bin/common_constants.py b/bin/common_constants.py index fda0c384..ea9fbbd4 100755 --- a/bin/common_constants.py +++ b/bin/common_constants.py @@ -9,6 +9,7 @@ RANGE_U_DSET = 'ci_upper' RANGE_L_DSET = 'ci_lower' BETA_DSET = 'beta' +ZSCORE_DSET = 'zscore' RSID = 'rsid' SE_DSET = 'standard_error' EFFECT_DSET = 'effect_allele' @@ -20,6 +21,7 @@ HM_RANGE_U_DSET = 'hm_ci_upper' HM_RANGE_L_DSET = 'hm_ci_lower' HM_BETA_DSET = 'hm_beta' +HM_ZSCORE_DSET = 'hm_zscore' HM_EFFECT_DSET = 'hm_effect_allele' HM_OTHER_DSET = 'hm_other_allele' HM_FREQ_DSET = 'hm_effect_allele_frequency' @@ -28,9 +30,9 @@ DSET_TYPES = {SNP_DSET: str, RSID: str, PVAL_DSET: float, MANTISSA_DSET: float, EXP_DSET: int, STUDY_DSET: str, - CHR_DSET: int, BP_DSET: int, OR_DSET: float, RANGE_U_DSET: float, RANGE_L_DSET: float, BETA_DSET: float, SE_DSET: float, + CHR_DSET: int, BP_DSET: int, OR_DSET: float, RANGE_U_DSET: float, RANGE_L_DSET: float, BETA_DSET: float, SE_DSET: float, ZSCORE_DSET: float, EFFECT_DSET: str, OTHER_DSET: str, FREQ_DSET: float, HM_EFFECT_DSET: str, - HM_OTHER_DSET: str, HM_BETA_DSET: float, HM_OR_DSET: float, HM_FREQ_DSET: float, HM_CODE: int, + HM_OTHER_DSET: str, HM_BETA_DSET: float, HM_OR_DSET: float, HM_FREQ_DSET: float, HM_CODE: int, HM_ZSCORE_DSET: float, HM_VAR_ID: str, HM_RANGE_L_DSET: float, HM_RANGE_U_DSET: float, HM_CC_DSET: str} REFERENCE_DSET = MANTISSA_DSET @@ -38,20 +40,20 @@ GWAS_CATALOG_STUDY_PREFIX = 'GCST' TO_DISPLAY_DEFAULT = {SNP_DSET, RSID, PVAL_DSET, STUDY_DSET, CHR_DSET, BP_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET, - HM_BETA_DSET, HM_EFFECT_DSET, HM_OTHER_DSET, HM_FREQ_DSET, HM_CODE} + HM_BETA_DSET, HM_ZSCORE_DSET, HM_EFFECT_DSET, HM_OTHER_DSET, HM_FREQ_DSET, HM_CODE} -TO_DISPLAY_RAW = {SNP_DSET, RSID, PVAL_DSET, STUDY_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET, +TO_DISPLAY_RAW = {SNP_DSET, RSID, PVAL_DSET, STUDY_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET, ZSCORE_DSET, SE_DSET, EFFECT_DSET, OTHER_DSET, FREQ_DSET} -TO_LOAD_DSET_HEADERS_DEFAULT = {SNP_DSET, RSID, PVAL_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET, - SE_DSET, EFFECT_DSET, OTHER_DSET, FREQ_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET, HM_BETA_DSET, +TO_LOAD_DSET_HEADERS_DEFAULT = {SNP_DSET, RSID, PVAL_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET, ZSCORE_DSET, + SE_DSET, EFFECT_DSET, OTHER_DSET, FREQ_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET, HM_BETA_DSET, HM_ZSCORE_DSET, HM_EFFECT_DSET, HM_OTHER_DSET, HM_FREQ_DSET, HM_CODE} TO_STORE_DSETS_DEFAULT = {SNP_DSET, RSID, MANTISSA_DSET, EXP_DSET, STUDY_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, - BETA_DSET, SE_DSET, EFFECT_DSET, OTHER_DSET, FREQ_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET, HM_BETA_DSET, + BETA_DSET, ZSCORE_DSET, SE_DSET, EFFECT_DSET, OTHER_DSET, FREQ_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET, HM_BETA_DSET, HM_ZSCORE_DSET, HM_EFFECT_DSET, HM_OTHER_DSET, HM_FREQ_DSET, HM_VAR_ID, HM_CODE} -TO_QUERY_DSETS_DEFAULT = {SNP_DSET, RSID, MANTISSA_DSET, EXP_DSET, STUDY_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET, - SE_DSET, EFFECT_DSET, OTHER_DSET, FREQ_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET, HM_BETA_DSET, +TO_QUERY_DSETS_DEFAULT = {SNP_DSET, RSID, MANTISSA_DSET, EXP_DSET, STUDY_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET, ZSCORE_DSET, + SE_DSET, EFFECT_DSET, OTHER_DSET, FREQ_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET, HM_BETA_DSET, HM_ZSCORE_DSET, HM_EFFECT_DSET, HM_OTHER_DSET, HM_FREQ_DSET, HM_VAR_ID, HM_CODE} TO_INDEX = {SNP_DSET, RSID, PVAL_DSET, CHR_DSET, BP_DSET} REQUIRED = {CHR_DSET, PVAL_DSET, SNP_DSET}#, EFFECT_DSET, OTHER_DSET} @@ -64,6 +66,7 @@ RSID: "--rsid_col", BETA_DSET: "--beta_col", OR_DSET: "--or_col", + ZSCORE_DSET: "--zscore_col", RANGE_L_DSET: "--or_col_lower", RANGE_U_DSET: "--or_col_upper", FREQ_DSET: "--eaf_col", diff --git a/bin/lib/SumStatRecord.py b/bin/lib/SumStatRecord.py index 899dc697..a982957f 100755 --- a/bin/lib/SumStatRecord.py +++ b/bin/lib/SumStatRecord.py @@ -4,8 +4,8 @@ class SumStatRecord: """ Class to hold a summary statistic record. """ - def __init__(self, chrom, pos, other_al, effect_al, beta, oddsr, - oddsr_lower, oddsr_upper, eaf, rsid, data,hm_coordinate_conversion): + def __init__(self, chrom, pos, other_al, effect_al, beta, zscore, oddsr, + oddsr_lower, oddsr_upper, eaf, rsid, data, hm_coordinate_conversion): # Set raw info self.chrom = chrom @@ -15,6 +15,7 @@ def __init__(self, chrom, pos, other_al, effect_al, beta, oddsr, self.data = data self.beta = safe_float(beta) if beta is not None else None self.oddsr = safe_float(oddsr) if oddsr is not None else None + self.zscore = safe_float(zscore) if zscore is not None else None self.oddsr_lower = safe_float(oddsr_lower) if oddsr_lower is not None else None self.oddsr_upper = safe_float(oddsr_upper) if oddsr_upper is not None else None self.rsid = str(rsid) if rsid is not None else None @@ -77,6 +78,12 @@ def flip_beta(self): if self.beta: if self.beta != 0: self.beta = self.beta * -1 + + # Flip Z-score + if self.zscore: + if self.zscore != 0: + self.zscore = self.zscore * -1 + # Flip OR if self.oddsr: self.oddsr = self.oddsr ** -1 @@ -93,6 +100,7 @@ def flip_beta(self): # Flip eaf if self.eaf: self.eaf = 1 - self.eaf + #print(f"pos:{self.pos},beta:{self.beta}, OR:{self.oddsr},zscore:{self.zscore},eaf:{self.eaf}") def alleles(self): """ @@ -108,6 +116,7 @@ def __repr__(self): " other allele : " + str(self.other_al), " effect allele: " + str(self.effect_al), " beta : " + str(self.beta), + " Z-score : " + str(self.zscore), " odds ratio : " + str(self.oddsr), " EAF : " + str(self.eaf) ]) diff --git a/bin/log_script.sh b/bin/log_script.sh index 88a633a0..adf29db1 100755 --- a/bin/log_script.sh +++ b/bin/log_script.sh @@ -3,7 +3,7 @@ helpFunction() { echo -e "" - echo -e "Usage: $0 -r reference -i input -c count -d removed -h harmonized -u unmapped -q qc -s script -o output\n" + echo -e "Usage: $0 -r reference -i input -c count -d removed -h harmonized -u unmapped -q qc -s script -o output -p pipeline_version\n" echo -e "\t-r Reference data\n" echo -e "\t-i input raw data\n" echo -e "\t-c Total_strand_count\n" @@ -11,10 +11,11 @@ helpFunction() echo -e "\t-h Harmonization result\n" echo -e "\t-u unmapped sites file\n" echo -e "\t-o Output file\n" + echo -e "\t-p Pipeline version\n" exit 1 # Exit script after printing help } -while getopts "r:i:c:d:h:u:o:" opt +while getopts "r:i:c:d:h:u:o:p:" opt do case "$opt" in r ) reference="$OPTARG" ;; @@ -24,14 +25,15 @@ do h ) harmonized="$OPTARG" ;; u ) unmapped="$OPTARG" ;; o ) output="$OPTARG" ;; + p ) version="$OPTARG" ;; ? ) helpFunction ;; # Print helpFunction in case parameter is non-existent esac done -echo -e $reference,$input,$count,$removed,$harmonized,$unmapped,$qc,$script,$output +echo -e $reference,$input,$count,$removed,$harmonized,$unmapped,$qc,$script,$output,$version # Print helpFunction in case parameters are empty -if [ -z "$reference" ] || [ -z "$input" ] || [ -z "$count" ] || [ -z "$removed" ] || [ -z "$harmonized" ] || [ -z "$unmapped" ] || [ -z "$output" ] +if [ -z "$reference" ] || [ -z "$input" ] || [ -z "$count" ] || [ -z "$removed" ] || [ -z "$harmonized" ] || [ -z "$unmapped" ] || [ -z "$output" ] || [ -z "$version" ] then echo -e "Some or all of the parameters are empty"; helpFunction @@ -48,7 +50,7 @@ HARMONISATION RUNNING REPORT\n echo -e " 1. Pipeline details\n - A. Pipeline Version: 0.1.0\n + A. Pipeline Version: $version\n B. Running date: $(date | awk '{print $2,$3,$6}')\n C. Input file: $(basename $input)\n ################################################################\n\n @@ -84,7 +86,7 @@ echo -e '4. Palindromic SNPs\n\npalin_mode: '$palin_mode'\n' >> $output ratio=$(grep ratio $count); number=$(echo -e $ratio | awk '{print $2}') if [ $palin_mode = "drop" ]; then -if [ ! $number]; then echo -e 'Palindromic SNPs could not be harmonized because the direction of palindromic SNPs cannot be inferred from consensus direction.\n'>> $output; +if [ ! $number ]; then echo -e 'Palindromic SNPs could not be harmonized because the direction of palindromic SNPs cannot be inferred from consensus direction.\n'>> $output; else echo -e 'Palindromic SNPs could not be harmonized because the direction of palindromic SNPs cannot be inferred from consensus direction (forward sites ratio ='$number').\n'>> $output; fi elif [[ $ratio =~ "Full" ]]; then diff --git a/bin/main_pysam.py b/bin/main_pysam.py index 847db311..8ae31496 100755 --- a/bin/main_pysam.py +++ b/bin/main_pysam.py @@ -41,7 +41,6 @@ def main(): # Process each row in summary statistics for counter, ss_rec in enumerate(yield_sum_stat_records(args.sumstats, args.in_sep)): - # If set to only process 1 chrom, skip none matching chroms if args.only_chrom and not args.only_chrom == ss_rec.chrom: continue @@ -148,7 +147,7 @@ def main(): # # Write ssrec to output ------------------------------------------------ # - + print("ss_rec.zscore:",ss_rec.zscore) if args.hm_sumstats: out_raw = OrderedDict() out_raw["chromosome"] = ss_rec.hm_chrom if vcf_rec and ss_rec.is_harmonised else args.na_rep_out @@ -157,6 +156,7 @@ def main(): out_raw["other_allele"] = ss_rec.hm_other_al.str() if vcf_rec and ss_rec.is_harmonised else args.na_rep_out out_raw["beta"] = ss_rec.beta if ss_rec.beta is not None and ss_rec.is_harmonised else args.na_rep_out out_raw["odds_ratio"] = ss_rec.oddsr if ss_rec.oddsr is not None and ss_rec.is_harmonised else args.na_rep_out + out_raw["zscore"] = ss_rec.zscore if ss_rec.zscore is not None and ss_rec.is_harmonised else args.na_rep_out out_raw["ci_lower"] = ss_rec.oddsr_lower if ss_rec.oddsr_lower is not None and ss_rec.is_harmonised else args.na_rep_out out_raw["ci_upper"] = ss_rec.oddsr_upper if ss_rec.oddsr_upper is not None and ss_rec.is_harmonised else args.na_rep_out out_raw["effect_allele_frequency"] = ss_rec.eaf if ss_rec.eaf is not None and ss_rec.is_harmonised else args.na_rep_out @@ -174,7 +174,7 @@ def main(): except: out_raw["standard_error"]=args.na_rep_out # Add other data from summary stat file - outed=["chromosome","base_pair_location","p_value","effect_allele","other_allele","effect_allele_frequency","beta","odds_ratio","rsid","standard_error","ci_upper","ci_lower","hm_coordinate_conversion"] + outed=["chromosome","base_pair_location","p_value","effect_allele","other_allele","effect_allele_frequency","beta","odds_ratio","rsid","standard_error","ci_upper","ci_lower","hm_coordinate_conversion","zscore"] for key in ss_rec.data: if key not in outed: value = ss_rec.data[key] if ss_rec.data[key] else args.na_rep_out @@ -183,10 +183,8 @@ def main(): generated_new_header=["hm_code","variant_id","rsid"] add_header=[x for x in generated_new_header if x not in out_header] new_order=out_header+add_header - out_row = OrderedDict((k, out_raw[k]) for k in new_order) - # Write header if not header_written: outline = args.out_sep.join([str(x) for x in out_row.keys()]) + "\n" @@ -301,6 +299,8 @@ def parse_args(): help=('Other allele column'), type=str, required=True) incols_group.add_argument('--beta_col', metavar="", help=('beta column'), type=str) + incols_group.add_argument('--zscore_col', metavar="", + help=('Z-score column'), type=str) incols_group.add_argument('--or_col', metavar="", help=('Odds ratio column'), type=str) incols_group.add_argument('--or_col_lower', metavar="", @@ -716,6 +716,7 @@ def yield_sum_stat_records(inf, sep): row[args.otherAl_col], row[args.effAl_col], row.get(args.beta_col, None), + row.get(args.zscore_col, None), row.get(args.or_col, None), row.get(args.or_col_lower, None), row.get(args.or_col_upper, None), diff --git a/environments/conda_environment.yml b/environments/conda_environment.yml index 17991252..662c3331 100644 --- a/environments/conda_environment.yml +++ b/environments/conda_environment.yml @@ -16,7 +16,7 @@ dependencies: - docutils==0.16 - gitdb==4.0.5 - GitPython==3.1.2 - - gwas-sumstats-tools==0.1.3a0 + - gwas-sumstats-tools==1.0.22 - idna==2.9 - importlib-metadata==1.6.0 - ipython-genutils==0.2.0 diff --git a/modules/local/harmonization_log.nf b/modules/local/harmonization_log.nf index 4705c9ed..8d3b6461 100644 --- a/modules/local/harmonization_log.nf +++ b/modules/local/harmonization_log.nf @@ -24,7 +24,8 @@ process harmonization_log { -d $delete_sites \ -h $all_hm \ -u $unmapped \ - -o ${GCST}.running.log + -o ${GCST}.running.log \ + -p ${params.version} N=\$(awk -v RS='\t' '/hm_code/{print NR; exit}' $qc_result) sed 1d $qc_result| awk -F "\t" '{print \$'"\$N"'}' | creat_log.py >> ${GCST}.running.log diff --git a/test_data/random_name.tsv b/test_data/random_name.tsv new file mode 100644 index 00000000..e5b80a0a --- /dev/null +++ b/test_data/random_name.tsv @@ -0,0 +1,5 @@ +chromosome base_pair_location effect_allele other_allele beta standard_error effect_allele_frequency p_value info rsid zscore odds_ratio +1 693731 C T 0.01 0.00806496 0.002779 0.1 ref_rs12238997_A_G NA 0.02 0.03 +1 935393 G GCCACGGG -0.01 0.00806496 0.997221 0.1 ref_rs1469404497_G_GCCACGGG_norsid_flipped NA -0.02 -0.03 +1 935475 GCG G -0.01 0.00806496 0.997221 0.1 ref_rs1014128468_CGC_C_norsid_flipped NA -0.02 -0.03 +22 16052962 T C -0.00477642 0.0164749 0.089851 0.77 ref_rs376238049_G_A NA 0.02 0.03 \ No newline at end of file diff --git a/test_data/rsID.sql b/test_data/rsID.sql deleted file mode 100644 index 65ec55d006d398277f050237d8bfb4096da3a705..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 zcmeI&&ui2`6bJB3Ci`O|o98VD6{hsk1#vfaGMw16E(IK#l8MUx*4xUy^V)q5{08^7_aG$VZ1r8E?ny@u5{$( z<)zLonODhSEcaE)g;`hBtKKk+?x`9Zj%QEOsCVG>{Ez9UWq+AJ4hQ|}_DNQ9Yc;lg ze-z$X@6Vo@zwo{eGk?U&;`wgppx3sgZnN8>o{Zwrf>*CfxqM|sF0XdGr^hwf^RvRc z55i=jp;T2abuKQhb|*FCl4`a#w`tv=AM}|%&>MP1FX$;frY+rx1_1~_00Izz00bZa z0SG_<0uX?}KMRn}*}RjaemiKko61*?;B1a3X;5$bf#=noG8gkn>UphtI|w{gDRD8E zq^cdLpy{hhagw&`LCb3dPC?&YNYa*Ws-<&F=T@EvenW81Ts?cEp&U~WnR-b1XO(Ky z%Zv**8)@45D{M*)`bJ;qBfX{9^pc*@6MCdO(I5Z;2tWV=5P$##AOHafKmY;|II4id z=h$r3Uf7fBMZ1-!`9j^Cq|*%oF-fNz0;N37Hw22r=k?6H+Y7efuFm!PpO?8S#_Rn_ z$?gtAd9wcBG3Xclr0?`q?*e?%{{g(GceHcV55#>CfB*y_009U<00Izz00bZa0SNpj j0sS)obL=b@3R$$QEOIf9Jd37jaly)2C}p9@xe&hrb+*fs diff --git a/workflows/gwascatalogharm.nf b/workflows/gwascatalogharm.nf index 9e8fc8c6..43efa50b 100644 --- a/workflows/gwascatalogharm.nf +++ b/workflows/gwascatalogharm.nf @@ -12,13 +12,16 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ params.harm=null -if (params.harm) { - if (!params.harm) { println " ERROR: You didn't set any files to be harmonized \ Please set --harm and try again (: " System.exit(1) } + +if (!params.version) { + println " ERROR: Please specific the pipeline version you are running (e.g. v1.1.9) \ + Please set --version and try again (: " + System.exit(1) } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -86,7 +89,7 @@ workflow GWASCATALOGHARM { def input_files(Path input) { - return [(input.getName()=~ /GCST\d+/).findAll()[0],input+"-meta.yaml",input] + return [input.getName().split('\\.')[0],input+"-meta.yaml",input] } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/workflows/gwascatalogharm_gwascatalog.nf b/workflows/gwascatalogharm_gwascatalog.nf index 1a2288ef..9dacd276 100644 --- a/workflows/gwascatalogharm_gwascatalog.nf +++ b/workflows/gwascatalogharm_gwascatalog.nf @@ -25,6 +25,12 @@ if (!params.to_build & !params.chrom) { System.exit(1) } } + +if (!params.version) { + println " ERROR: Please specific the pipeline version you are running (e.g. v1.1.9) \ + Please set --version and try again (: " + System.exit(1) +} /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT LOCAL MODULES/SUBWORKFLOWS From 5351e0248bf7d42543fb4d042c06e3ecff0b3fe6 Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Tue, 29 Oct 2024 15:12:32 +0000 Subject: [PATCH 03/24] structure improvement 1. isolated executor config 2. rearrange the output folder structure. --- config/basic.config | 4 ++-- config/{lsf.config => default_params.config} | 0 config/executor.config | 5 +++++ nextflow.config | 8 +++----- test_data/GCST0.tsv | 2 -- test_data/GCST1.tsv | 2 -- 6 files changed, 10 insertions(+), 11 deletions(-) rename config/{lsf.config => default_params.config} (100%) create mode 100644 config/executor.config diff --git a/config/basic.config b/config/basic.config index 659e6e29..99ad7cc4 100644 --- a/config/basic.config +++ b/config/basic.config @@ -60,7 +60,7 @@ process { withName:concatenate_chr_splits { publishDir =[ - path:{"${launchDir}/$GCST/final"}, + path:{"${launchDir}/$GCST/qc"}, mode: 'copy' ] } @@ -69,7 +69,7 @@ process { memory = { 10.GB * task.attempt } time = { 5.h * task.attempt } publishDir =[ - path:{"${launchDir}/$GCST/final"}, + path:{"${launchDir}/$GCST/qc"}, mode: 'copy' ] } diff --git a/config/lsf.config b/config/default_params.config similarity index 100% rename from config/lsf.config rename to config/default_params.config diff --git a/config/executor.config b/config/executor.config new file mode 100644 index 00000000..27389799 --- /dev/null +++ b/config/executor.config @@ -0,0 +1,5 @@ +process{ + executor = 'slurm' + memory = '28GB' + time = '2d' +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index efcb553c..5ef431a9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -65,7 +65,7 @@ if (!params.harm & !params.gwascatalog) { if (params.harm) { if (!params.reference & !params.gwascatalog) { - includeConfig 'config/lsf.config' + includeConfig 'config/default_params.config' if (params.chromlist){ params.chrom = params.chromlist?.tokenize(',') as List } @@ -86,10 +86,8 @@ profiles { standard { process.executor = 'local' } - cluster { - process.executor = 'slurm' - process.memory = '28GB' - process.time = '2d' + executor { + includeConfig './config/executor.config' } conda { params.enable_conda = true diff --git a/test_data/GCST0.tsv b/test_data/GCST0.tsv index 90371a96..9e9c95a6 100644 --- a/test_data/GCST0.tsv +++ b/test_data/GCST0.tsv @@ -1,7 +1,5 @@ chromosome base_pair_location effect_allele other_allele beta standard_error effect_allele_frequency p_value info rsid 1 693730 A G -0.016619 0.00806496 0.997221 0.1 ref_rs12238997 NA -1 935393 GCCACGGG G -0.016619 0.00806496 0.997221 0.1 ref_rs1469404497 NA 1 935393 G GCCACGGG -0.016619 0.00806496 0.997221 0.1 ref_rs1469404497 rs1469404497 1 935474 CGC C -0.016619 0.00806496 0.997221 0.1 ref_rs1014128468 NA -1 935474 CGC C -0.016619 0.00806496 0.997221 0.1 ref_rs1014128468 rs1014128468 22 16052961 T C -0.00477642 0.0164749 0.089851 0.77 ref_rs376238049 NA \ No newline at end of file diff --git a/test_data/GCST1.tsv b/test_data/GCST1.tsv index 7c948e48..44c9baa4 100644 --- a/test_data/GCST1.tsv +++ b/test_data/GCST1.tsv @@ -1,7 +1,5 @@ chromosome base_pair_location effect_allele other_allele beta standard_error effect_allele_frequency p_value info rsid 1 693731 A G -0.016619 0.00806496 0.997221 0.1 ref_rs12238997 NA 1 935393 G GCCACGGG -0.016619 0.00806496 0.997221 0.1 ref_rs1469404497 NA -1 935393 G GCCACGGG -0.016619 0.00806496 0.997221 0.1 ref_rs1469404497 rs1469404497 -1 935475 CGC C -0.016619 0.00806496 0.997221 0.1 ref_rs1014128468 NA 1 935475 CGC C -0.016619 0.00806496 0.997221 0.1 ref_rs1014128468 rs1014128468 22 16052962 T C -0.00477642 0.0164749 0.089851 0.77 ref_rs376238049 NA \ No newline at end of file From b832341e717f50d977abd54930404ee9a8cf643a Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Tue, 29 Oct 2024 15:23:19 +0000 Subject: [PATCH 04/24] change test input data --- config/default_params.config | 1 - config/test.config | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/config/default_params.config b/config/default_params.config index eafb187f..6af1d60f 100644 --- a/config/default_params.config +++ b/config/default_params.config @@ -1,6 +1,5 @@ params { to_build='38' chrom=['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','X','Y','MT'] - ref='' threshold='0.99' } diff --git a/config/test.config b/config/test.config index 1150619d..4719f6c8 100644 --- a/config/test.config +++ b/config/test.config @@ -4,7 +4,7 @@ params { ref=System.getenv('NXF_ASSETS') ? "$NXF_ASSETS/EBISPOT/gwas-sumstats-harmoniser/test_data" : "$HOME/.nextflow/assets/EBISPOT/gwas-sumstats-harmoniser/test_data" threshold='0.99' harm = true - file= System.getenv('NXF_ASSETS') ? "$NXF_ASSETS/EBISPOT/gwas-sumstats-harmoniser/test_data/GCST1.tsv" : "$HOME/.nextflow/assets/EBISPOT/gwas-sumstats-harmoniser/test_data/GCST1.tsv" + file= System.getenv('NXF_ASSETS') ? "$NXF_ASSETS/EBISPOT/gwas-sumstats-harmoniser/test_data/GCST1.tsv" : "$HOME/.nextflow/assets/EBISPOT/gwas-sumstats-harmoniser/test_data/random_name.tsv" } process{ From 4a9cf4402fd76f6b7eca169bd1bea7d10420bbc2 Mon Sep 17 00:00:00 2001 From: jiyue1214 <52689284+jiyue1214@users.noreply.github.com> Date: Tue, 29 Oct 2024 15:26:54 +0000 Subject: [PATCH 05/24] Create random_name.tsv-meta.yaml --- test_data/random_name.tsv-meta.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 test_data/random_name.tsv-meta.yaml diff --git a/test_data/random_name.tsv-meta.yaml b/test_data/random_name.tsv-meta.yaml new file mode 100644 index 00000000..8e101aba --- /dev/null +++ b/test_data/random_name.tsv-meta.yaml @@ -0,0 +1,15 @@ +# Study meta-data +date_metadata_last_modified: 2023-02-09 + +# Genotyping Information +genome_assembly: GRCh37 +coordinate_system: 1-based + +# Summary Statistic information +data_file_name: GCST1.tsv +file_type: GWAS-SSF v0.1 +data_file_md5sum: 32ce41c3dca4cd9f463a0ce7351966fd + +# Harmonization status +is_harmonised: false +is_sorted: false From 4d8112ab4b6f493cedf6358fd62a1f642f9e3893 Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Tue, 29 Oct 2024 15:32:49 +0000 Subject: [PATCH 06/24] Update test.config test on chr 1 and chr 22 since there are limited variants in the test file. --- config/test.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/test.config b/config/test.config index 4719f6c8..308e4ca7 100644 --- a/config/test.config +++ b/config/test.config @@ -1,8 +1,8 @@ params { to_build='38' - chrom='22' + chrom=['1','22'] ref=System.getenv('NXF_ASSETS') ? "$NXF_ASSETS/EBISPOT/gwas-sumstats-harmoniser/test_data" : "$HOME/.nextflow/assets/EBISPOT/gwas-sumstats-harmoniser/test_data" - threshold='0.99' + threshold='0.5' harm = true file= System.getenv('NXF_ASSETS') ? "$NXF_ASSETS/EBISPOT/gwas-sumstats-harmoniser/test_data/GCST1.tsv" : "$HOME/.nextflow/assets/EBISPOT/gwas-sumstats-harmoniser/test_data/random_name.tsv" } From 49f03b48d9001bf4e4df31de2e426f382926378a Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Tue, 29 Oct 2024 16:49:19 +0000 Subject: [PATCH 07/24] updates 1. allow the user to choose to terminate whenever an error occurs 2. check all parameters in the main.nf 3. reorganise the output folder. --- config/basic.config | 4 -- config/exit_error.config | 5 ++ config/ignore_error.config | 5 ++ main.nf | 59 ++++++++++++++++++++---- modules/local/harmonization_log.nf | 6 +++ modules/local/qc.nf | 6 --- nextflow.config | 8 +++- workflows/gwascatalogharm.nf | 11 ----- workflows/gwascatalogharm_gwascatalog.nf | 20 -------- 9 files changed, 74 insertions(+), 50 deletions(-) create mode 100644 config/exit_error.config create mode 100644 config/ignore_error.config diff --git a/config/basic.config b/config/basic.config index 99ad7cc4..62ce91d5 100644 --- a/config/basic.config +++ b/config/basic.config @@ -4,10 +4,6 @@ process { memory = { 1.GB * task.attempt } time = { 1.h * task.attempt } - errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'ignore' } - maxRetries = 5 - maxErrors = '-1' - withName:map_to_build { memory = { 28.GB * task.attempt } time = { 5.h * task.attempt } diff --git a/config/exit_error.config b/config/exit_error.config new file mode 100644 index 00000000..8e7ad6cf --- /dev/null +++ b/config/exit_error.config @@ -0,0 +1,5 @@ +process { + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'terminate' } + maxRetries = 5 + maxErrors = '-1' +} \ No newline at end of file diff --git a/config/ignore_error.config b/config/ignore_error.config new file mode 100644 index 00000000..989d80ba --- /dev/null +++ b/config/ignore_error.config @@ -0,0 +1,5 @@ +process { + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'ignore' } + maxRetries = 5 + maxErrors = '-1' +} \ No newline at end of file diff --git a/main.nf b/main.nf index 6ae6f14b..9e8292f3 100644 --- a/main.nf +++ b/main.nf @@ -28,25 +28,68 @@ include { GWASCATALOGHARM } from './workflows/gwascatalogharm' // WORKFLOW: Run main nf-core/gwascatalogharm analysis pipeline // workflow NFCORE_GWASCATALOGHARM { + + // Check mandatory parameters params.reference = null params.gwascatalog = null params.harm = null - if(params.reference){ - println ("Prepare the reference ...") - PREPARE_REFERENCE() + if (!params.to_build) { + println "ERROR: You didn't set the target build to harmonise to" + println "Please set --to_build 38" + System.exit(1) } - else if(params.gwascatalog){ - println ("Harmonizing files in the folder ${params.all_harm_folder}") - GWASCATALOGHARM_GWASCATALOG() + if (!params.chrom) { + println "ERROR: You didn't set chromsomes to be harmnnised" + println "Please set --chrom 22 or --chromlist 22,X,Y or set chrom in ./config/default_params.config " + System.exit(1) } - else if(params.harm){ - GWASCATALOGHARM() + if (!params.threshold) { + println "ERROR: You didn't set threshold to imput the direction of palindromic variants" + println "Please set --threshold 0.99 or set threshold in ./config/default_params.config " + System.exit(1) } + + if (!params.version) { + println " ERROR: Please specific the pipeline version you are running (e.g. v1.1.9) \ + Please set --version and try again (: " + System.exit(1) + } + + // check conditinal input parameters + + if (params.reference) { + println ("Prepare the reference ...") + PREPARE_REFERENCE() + } else if (params.gwascatalog) { + if (!params.to_harm_folder) { + println " ERROR: You didn't set any folder to be harmonised \ + Please set --to_harm_folder and try again (: " + System.exit(1) + } else { + println ("Harmonizing files in the folder ${params.all_harm_folder}") + GWASCATALOGHARM_GWASCATALOG() + } + } else if (params.harm) { + if (!params.file && !params.list) { + println " ERROR: You didn't set any files to be harmonised \ + Please set --file for a single input file or \ + set --list for a list containing all files are waiting to be harmonised \ + and try again (: " + System.exit(1) + } else { + println ("Start harmonising files") + GWASCATALOGHARM() + } + } else { + println " ERROR: You didn't set any model to run the pipeline \ + Please set --harm and try again (: " + System.exit(1) + } } /* diff --git a/modules/local/harmonization_log.nf b/modules/local/harmonization_log.nf index 8d3b6461..1f8eefea 100644 --- a/modules/local/harmonization_log.nf +++ b/modules/local/harmonization_log.nf @@ -17,6 +17,12 @@ process harmonization_log { shell: """ + chr=\$(awk -v RS='\t' '/chromosome/{print NR; exit}' $qc_result) + pos=\$(awk -v RS='\t' '/base_pair_location/{print NR; exit}' $qc_result) + + cat $qc_result | bgzip -c > ${launchDir}/$GCST/final/${GCST}.h.tsv.gz + tabix -c N -S 1 -f -s \$chr -b \$pos -e \$pos ${launchDir}/$GCST/final/${GCST}.h.tsv.gz + log_script.sh \ -r "${params.ref}/homo_sapiens-${chr}.vcf.gz" \ -i $input \ diff --git a/modules/local/qc.nf b/modules/local/qc.nf index fb7a4f6f..483a5422 100644 --- a/modules/local/qc.nf +++ b/modules/local/qc.nf @@ -23,11 +23,5 @@ process qc { -o harmonised.qc.tsv \ --log report.txt \ -db ${params.ref}/rsID.sql - - chr=\$(awk -v RS='\t' '/chromosome/{print NR; exit}' harmonised.qc.tsv) - pos=\$(awk -v RS='\t' '/base_pair_location/{print NR; exit}' harmonised.qc.tsv) - - cat harmonised.qc.tsv | bgzip -c > ${launchDir}/$GCST/final/${GCST}.h.tsv.gz - tabix -c N -S 1 -f -s \$chr -b \$pos -e \$pos ${launchDir}/$GCST/final/${GCST}.h.tsv.gz """ } diff --git a/nextflow.config b/nextflow.config index 5ef431a9..635b58b7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -53,10 +53,15 @@ manifest { mainScript = 'main.nf' } +if (params.terminate_error) { + includeConfig './config/exit_error.config' +} else { + includeConfig './config/ignore_error.config' +} if (params.reference) { if (!params.harm & !params.gwascatalog) { - includeConfig 'config/reference.config' + includeConfig './config/reference.config' if (params.chromlist){ params.chrom = params.chromlist?.tokenize(',') as List } @@ -66,6 +71,7 @@ if (!params.harm & !params.gwascatalog) { if (params.harm) { if (!params.reference & !params.gwascatalog) { includeConfig 'config/default_params.config' + includeConfig './config/basic.config' if (params.chromlist){ params.chrom = params.chromlist?.tokenize(',') as List } diff --git a/workflows/gwascatalogharm.nf b/workflows/gwascatalogharm.nf index 43efa50b..ebc9bc02 100644 --- a/workflows/gwascatalogharm.nf +++ b/workflows/gwascatalogharm.nf @@ -11,18 +11,7 @@ CONFIG FILES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -params.harm=null -if (!params.harm) { - println " ERROR: You didn't set any files to be harmonized \ - Please set --harm and try again (: " - System.exit(1) -} -if (!params.version) { - println " ERROR: Please specific the pipeline version you are running (e.g. v1.1.9) \ - Please set --version and try again (: " - System.exit(1) -} /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT LOCAL MODULES/SUBWORKFLOWS diff --git a/workflows/gwascatalogharm_gwascatalog.nf b/workflows/gwascatalogharm_gwascatalog.nf index 9dacd276..3cf388c8 100644 --- a/workflows/gwascatalogharm_gwascatalog.nf +++ b/workflows/gwascatalogharm_gwascatalog.nf @@ -11,26 +11,6 @@ CONFIG FILES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -params.to_harm_folder=null -if (params.to_harm_folder) { -if (!params.inputPath & !params.to_harm_folder) { - println " ERROR: You didn't set any folder to be harmonized \ - Please set --to_harm_folder and --inputPath and try again (: " - System.exit(1) -} - -if (!params.to_build & !params.chrom) { - println "ERROR: You didn't set the target build and chromsomes to be harmnnized" - println "Please set --to_build 38 or --chrom ['1','2',...]" - System.exit(1) -} -} - -if (!params.version) { - println " ERROR: Please specific the pipeline version you are running (e.g. v1.1.9) \ - Please set --version and try again (: " - System.exit(1) -} /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT LOCAL MODULES/SUBWORKFLOWS From 1712d2b35a261baad73f35dab80a14d6a290606a Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Tue, 29 Oct 2024 16:52:50 +0000 Subject: [PATCH 08/24] terminate_error parameter --- nextflow.config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nextflow.config b/nextflow.config index 635b58b7..f873ea2d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,6 +10,9 @@ params { +// error strategy parameter +terminate_error = null + // global parameter chromlist = null chrom = null From 85290afdff8249d0d38cf15a9bb7e2a6deeea86a Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Tue, 29 Oct 2024 17:04:02 +0000 Subject: [PATCH 09/24] excuator should only run when excuator available --- nextflow.config | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/nextflow.config b/nextflow.config index f873ea2d..b0a0f1d9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -56,12 +56,6 @@ manifest { mainScript = 'main.nf' } -if (params.terminate_error) { - includeConfig './config/exit_error.config' -} else { - includeConfig './config/ignore_error.config' -} - if (params.reference) { if (!params.harm & !params.gwascatalog) { includeConfig './config/reference.config' @@ -97,6 +91,11 @@ profiles { } executor { includeConfig './config/executor.config' + if (params.terminate_error) { + includeConfig './config/exit_error.config' + } else { + includeConfig './config/ignore_error.config' + } } conda { params.enable_conda = true From 4b2252d01c8d10dc16f5d8a0cd4e88794f0eb920 Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Tue, 29 Oct 2024 17:43:38 +0000 Subject: [PATCH 10/24] isolate yaml update step --- config/test.config | 20 ++++++++++++-- modules/local/harmonization_log.nf | 2 +- modules/local/update_meta_yaml.nf | 40 +++++++++++++++++++++++++++ subworkflows/local/quality_control.nf | 5 ++-- 4 files changed, 62 insertions(+), 5 deletions(-) create mode 100644 modules/local/update_meta_yaml.nf diff --git a/config/test.config b/config/test.config index 308e4ca7..4e554add 100644 --- a/config/test.config +++ b/config/test.config @@ -13,7 +13,7 @@ process{ memory = { 3.GB * task.attempt } time = { 3.h * task.attempt } publishDir =[ - path:{"${launchDir}/$GCST"}, + path:{"${launchDir}/$GCST/map_to_build"}, mode: 'copy' ] } @@ -59,11 +59,18 @@ process{ ] } + withName:concatenate_chr_splits { + publishDir =[ + path:{"${launchDir}/$GCST"}, + mode: 'copy' + ] + } + withName:qc { memory = { 3.GB * task.attempt } time = { 1.h * task.attempt } publishDir =[ - path:{"${launchDir}/$GCST/final"}, + path:{"${launchDir}/$GCST/qc"}, mode: 'copy' ] } @@ -76,4 +83,13 @@ process{ mode: 'copy' ] } + + withName:update_meta_yaml { + memory = { 2.GB * task.attempt } + time = { 1.h * task.attempt } + publishDir =[ + path:{"${launchDir}/$GCST/final"}, + mode: 'copy' + ] + } } diff --git a/modules/local/harmonization_log.nf b/modules/local/harmonization_log.nf index 1f8eefea..deb5eec3 100644 --- a/modules/local/harmonization_log.nf +++ b/modules/local/harmonization_log.nf @@ -13,7 +13,7 @@ process harmonization_log { tuple val(GCST), val(mode), path(all_hm), path(qc_result), path(delete_sites), path(count), path(raw_yaml), path(input), path(unmapped) output: - tuple val(GCST), path(qc_result), path ("${GCST}.running.log"), path ("${GCST}.h.tsv.gz-meta.yaml"), env(result), emit: running_result + tuple val(GCST), path(raw_yaml), path(qc_result), path ("${GCST}.running.log"), env(result) shell: """ diff --git a/modules/local/update_meta_yaml.nf b/modules/local/update_meta_yaml.nf new file mode 100644 index 00000000..1c8be7cd --- /dev/null +++ b/modules/local/update_meta_yaml.nf @@ -0,0 +1,40 @@ +process update_meta_yaml { + tag "$GCST" + + conda (params.enable_conda ? "${task.ext.conda}" : null) + + container "${ workflow.containerEngine == 'singularity' && + !task.ext.singularity_pull_docker_container ? + "${task.ext.singularity}${task.ext.singularity_version}" : + "${task.ext.docker}${task.ext.docker_version}" }" + + input: + tuple val(GCST),path(raw_yaml), path(qc_result), path ("${GCST}.running.log"), env(result) + + output: + tuple val(GCST), path(qc_result), path ("${GCST}.running.log"), path ("${GCST}.h.tsv.gz-meta.yaml"), env(result), emit: running_result + + shell: + """ + # metadata file + + data_file_name="${GCST}.h.tsv.gz" + out_yaml="${GCST}.h.tsv.gz-meta.yaml" + data_file_md5sum=\$(md5sum<${launchDir}/$GCST/final/${GCST}.h.tsv.gz | awk '{print \$1}') + date_metadata_last_modified=\$(date +"%Y-%m-%d") + harmonisation_reference=\$(tabix -H "${params.ref}/homo_sapiens-${chr}.vcf.gz" | grep reference | cut -f2 -d '=') + + gwas_metadata.py \ + -i $raw_yaml \ + -o \$out_yaml \ + -e \ + --data_file_name \$data_file_name \ + --data_file_md5sum \$data_file_md5sum \ + --is_harmonised True \ + --is_sorted True \ + --genome_assembly GRCh38 \ + --coordinate_system 1-based \ + --date_metadata_last_modified \$date_metadata_last_modified \ + --harmonisation_reference \$harmonisation_reference \ + """ +} \ No newline at end of file diff --git a/subworkflows/local/quality_control.nf b/subworkflows/local/quality_control.nf index bf27657f..2d52b212 100644 --- a/subworkflows/local/quality_control.nf +++ b/subworkflows/local/quality_control.nf @@ -1,9 +1,9 @@ include {qc} from '../../modules/local/qc' include {harmonization_log} from '../../modules/local/harmonization_log' +include {update_meta_yaml} from '../../modules/local/update_meta_yaml' /*-------- module from nf-core -----------------*/ //include {bgzip} from '../../modules/nf-core/modules/tabix/bgzip/main' -//include {tabix} from '../../modules/nf-core/modules/tabix/tabix/main' workflow quality_control{ take: @@ -20,7 +20,8 @@ workflow quality_control{ input_log=ch_to_log.combine(unmapped,by:0) def to_log=chroms.flatten().last() harmonization_log(to_log,input_log) + update_meta_yaml(harmonization_log.out) emit: - qclog=harmonization_log.out.running_result + qclog=update_meta_yaml.out.running_result } From 67198bb9e71b3ec99982dddb43c1d809a09ede9f Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Tue, 29 Oct 2024 18:41:59 +0000 Subject: [PATCH 11/24] modify the channel to suite the nre output structure --- modules/local/harmonization_log.nf | 34 +++++++----------------------- modules/local/update_meta_yaml.nf | 4 ++-- 2 files changed, 10 insertions(+), 28 deletions(-) diff --git a/modules/local/harmonization_log.nf b/modules/local/harmonization_log.nf index deb5eec3..81533bc0 100644 --- a/modules/local/harmonization_log.nf +++ b/modules/local/harmonization_log.nf @@ -13,16 +13,11 @@ process harmonization_log { tuple val(GCST), val(mode), path(all_hm), path(qc_result), path(delete_sites), path(count), path(raw_yaml), path(input), path(unmapped) output: - tuple val(GCST), path(raw_yaml), path(qc_result), path ("${GCST}.running.log"), env(result) + tuple val(chr), val(GCST), path(raw_yaml), path("${GCST}.h.tsv.gz"), path("${GCST}.h.tsv.gz.tbi"), path(qc_result), path ("${GCST}.running.log"), env(result) shell: """ - chr=\$(awk -v RS='\t' '/chromosome/{print NR; exit}' $qc_result) - pos=\$(awk -v RS='\t' '/base_pair_location/{print NR; exit}' $qc_result) - - cat $qc_result | bgzip -c > ${launchDir}/$GCST/final/${GCST}.h.tsv.gz - tabix -c N -S 1 -f -s \$chr -b \$pos -e \$pos ${launchDir}/$GCST/final/${GCST}.h.tsv.gz - + # Generating running log log_script.sh \ -r "${params.ref}/homo_sapiens-${chr}.vcf.gz" \ -i $input \ @@ -36,27 +31,14 @@ process harmonization_log { N=\$(awk -v RS='\t' '/hm_code/{print NR; exit}' $qc_result) sed 1d $qc_result| awk -F "\t" '{print \$'"\$N"'}' | creat_log.py >> ${GCST}.running.log + # extract harmonise result result=\$(grep Result ${GCST}.running.log | cut -f2) - # metadata file - - data_file_name="${GCST}.h.tsv.gz" - out_yaml="${GCST}.h.tsv.gz-meta.yaml" - data_file_md5sum=\$(md5sum<${launchDir}/$GCST/final/${GCST}.h.tsv.gz | awk '{print \$1}') - date_metadata_last_modified=\$(date +"%Y-%m-%d") - harmonisation_reference=\$(tabix -H "${params.ref}/homo_sapiens-${chr}.vcf.gz" | grep reference | cut -f2 -d '=') + # Prepare the gzip data + chr=\$(awk -v RS='\t' '/chromosome/{print NR; exit}' $qc_result) + pos=\$(awk -v RS='\t' '/base_pair_location/{print NR; exit}' $qc_result) - gwas_metadata.py \ - -i $raw_yaml \ - -o \$out_yaml \ - -e \ - --data_file_name \$data_file_name \ - --data_file_md5sum \$data_file_md5sum \ - --is_harmonised True \ - --is_sorted True \ - --genome_assembly GRCh38 \ - --coordinate_system 1-based \ - --date_metadata_last_modified \$date_metadata_last_modified \ - --harmonisation_reference \$harmonisation_reference \ + cat $qc_result | bgzip -c > ${GCST}.h.tsv.gz + tabix -c N -S 1 -f -s \$chr -b \$pos -e \$pos ${GCST}.h.tsv.gz """ } \ No newline at end of file diff --git a/modules/local/update_meta_yaml.nf b/modules/local/update_meta_yaml.nf index 1c8be7cd..2a8ff5c3 100644 --- a/modules/local/update_meta_yaml.nf +++ b/modules/local/update_meta_yaml.nf @@ -9,7 +9,7 @@ process update_meta_yaml { "${task.ext.docker}${task.ext.docker_version}" }" input: - tuple val(GCST),path(raw_yaml), path(qc_result), path ("${GCST}.running.log"), env(result) + tuple val(chr), val(GCST), path(raw_yaml), path(zip_harm) , path(zip_harm_tbi) , path(qc_result), path (running_log), env(result) output: tuple val(GCST), path(qc_result), path ("${GCST}.running.log"), path ("${GCST}.h.tsv.gz-meta.yaml"), env(result), emit: running_result @@ -20,7 +20,7 @@ process update_meta_yaml { data_file_name="${GCST}.h.tsv.gz" out_yaml="${GCST}.h.tsv.gz-meta.yaml" - data_file_md5sum=\$(md5sum<${launchDir}/$GCST/final/${GCST}.h.tsv.gz | awk '{print \$1}') + data_file_md5sum=\$(md5sum<$zip_harm | awk '{print \$1}') date_metadata_last_modified=\$(date +"%Y-%m-%d") harmonisation_reference=\$(tabix -H "${params.ref}/homo_sapiens-${chr}.vcf.gz" | grep reference | cut -f2 -d '=') From e90020035123a82f69926599b02a2888ce63eae0 Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Tue, 29 Oct 2024 19:04:46 +0000 Subject: [PATCH 12/24] improve output structure --- config/test.config | 16 ++++++---------- modules/local/harmonization_log.nf | 2 +- modules/local/update_meta_yaml.nf | 6 +++--- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/config/test.config b/config/test.config index 4e554add..63bb47a5 100644 --- a/config/test.config +++ b/config/test.config @@ -13,7 +13,7 @@ process{ memory = { 3.GB * task.attempt } time = { 3.h * task.attempt } publishDir =[ - path:{"${launchDir}/$GCST/map_to_build"}, + path:{"${launchDir}/$GCST/1_map_to_build"}, mode: 'copy' ] } @@ -22,7 +22,7 @@ process{ memory = { 3.GB * task.attempt } time = { 3.h * task.attempt } publishDir =[ - path:{"${launchDir}/$GCST/ten_sc"}, + path:{"${launchDir}/$GCST/2_ten_sc"}, mode: 'copy' ] } @@ -38,7 +38,7 @@ process{ memory = { 3.GB * task.attempt } time = { 3.h * task.attempt } publishDir =[ - path:{"${launchDir}/$GCST/all_sc"}, + path:{"${launchDir}/$GCST/3_all_sc"}, mode: 'copy' ] } @@ -54,14 +54,14 @@ process{ memory = { 3.GB * task.attempt } time = { 3.h * task.attempt } publishDir =[ - path:{"${launchDir}/$GCST/harmonization"}, + path:{"${launchDir}/$GCST/4_harmonization"}, mode: 'copy' ] } withName:concatenate_chr_splits { publishDir =[ - path:{"${launchDir}/$GCST"}, + path:{"${launchDir}/$GCST/5_qc"}, mode: 'copy' ] } @@ -70,7 +70,7 @@ process{ memory = { 3.GB * task.attempt } time = { 1.h * task.attempt } publishDir =[ - path:{"${launchDir}/$GCST/qc"}, + path:{"${launchDir}/$GCST/5_qc"}, mode: 'copy' ] } @@ -78,10 +78,6 @@ process{ withName:harmonization_log { memory = { 3.GB * task.attempt } time = { 1.h * task.attempt } - publishDir =[ - path:{"${launchDir}/$GCST/final"}, - mode: 'copy' - ] } withName:update_meta_yaml { diff --git a/modules/local/harmonization_log.nf b/modules/local/harmonization_log.nf index 81533bc0..839e8e88 100644 --- a/modules/local/harmonization_log.nf +++ b/modules/local/harmonization_log.nf @@ -13,7 +13,7 @@ process harmonization_log { tuple val(GCST), val(mode), path(all_hm), path(qc_result), path(delete_sites), path(count), path(raw_yaml), path(input), path(unmapped) output: - tuple val(chr), val(GCST), path(raw_yaml), path("${GCST}.h.tsv.gz"), path("${GCST}.h.tsv.gz.tbi"), path(qc_result), path ("${GCST}.running.log"), env(result) + tuple val(chr), val(GCST), path(raw_yaml), path("${GCST}.h.tsv.gz"), path("${GCST}.h.tsv.gz.tbi"), path ("${GCST}.running.log"), env(result) shell: """ diff --git a/modules/local/update_meta_yaml.nf b/modules/local/update_meta_yaml.nf index 2a8ff5c3..57613639 100644 --- a/modules/local/update_meta_yaml.nf +++ b/modules/local/update_meta_yaml.nf @@ -9,10 +9,10 @@ process update_meta_yaml { "${task.ext.docker}${task.ext.docker_version}" }" input: - tuple val(chr), val(GCST), path(raw_yaml), path(zip_harm) , path(zip_harm_tbi) , path(qc_result), path (running_log), env(result) - + tuple val(chr), val(GCST), path(raw_yaml), path(zip_harm) , path(zip_harm_tbi), path (running_log), env(result) + output: - tuple val(GCST), path(qc_result), path ("${GCST}.running.log"), path ("${GCST}.h.tsv.gz-meta.yaml"), env(result), emit: running_result + tuple val(GCST), path(zip_harm) , path(zip_harm_tbi), path (running_log), path ("${GCST}.h.tsv.gz-meta.yaml"), env(result), emit: running_result shell: """ From f74e7307c67af6103845ac131af8c79b43809379 Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Tue, 29 Oct 2024 19:17:54 +0000 Subject: [PATCH 13/24] new output structure in basic.config --- config/basic.config | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/config/basic.config b/config/basic.config index 62ce91d5..0b674d00 100644 --- a/config/basic.config +++ b/config/basic.config @@ -8,7 +8,7 @@ process { memory = { 28.GB * task.attempt } time = { 5.h * task.attempt } publishDir =[ - path:{"${launchDir}/$GCST"}, + path:{"${launchDir}/$GCST/1_map_to_build"}, mode: 'copy' ] } @@ -17,7 +17,7 @@ process { memory = { 10.GB * task.attempt } time = { 5.h * task.attempt } publishDir =[ - path:{"${launchDir}/$GCST/ten_sc"}, + path:{"${launchDir}/$GCST/2_ten_sc"}, mode: 'copy' ] } @@ -33,7 +33,7 @@ process { memory = { 5.GB * task.attempt } time = { 3.h * task.attempt } publishDir =[ - path:{"${launchDir}/$GCST/all_sc"}, + path:{"${launchDir}/$GCST/3_all_sc"}, mode: 'copy' ] } @@ -49,14 +49,14 @@ process { memory = { 5.GB * task.attempt } time = { 3.h * task.attempt } publishDir =[ - path:{"${launchDir}/$GCST/harmonization"}, + path:{"${launchDir}/$GCST/4_harmonization"}, mode: 'copy' ] } withName:concatenate_chr_splits { publishDir =[ - path:{"${launchDir}/$GCST/qc"}, + path:{"${launchDir}/$GCST/5_qc"}, mode: 'copy' ] } @@ -65,7 +65,7 @@ process { memory = { 10.GB * task.attempt } time = { 5.h * task.attempt } publishDir =[ - path:{"${launchDir}/$GCST/qc"}, + path:{"${launchDir}/$GCST/5_qc"}, mode: 'copy' ] } @@ -73,7 +73,12 @@ process { withName:harmonization_log { memory = { 10.GB * task.attempt } time = { 3.h * task.attempt } - publishDir =[ + } + + withName:update_meta_yaml { + memory = { 10.GB * task.attempt } + time = { 3.h * task.attempt } + publishDir =[ path:{"${launchDir}/$GCST/final"}, mode: 'copy' ] From a4a5c3357119c105225a993d86100594d73280ed Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Tue, 29 Oct 2024 22:25:00 +0000 Subject: [PATCH 14/24] update the ftp move channel --- modules/local/failed_copy.nf | 17 +++++++++++++---- modules/local/ftp_copy.nf | 16 ++++++++++------ subworkflows/local/move_files.nf | 4 ++-- subworkflows/local/quality_control.nf | 1 + workflows/gwascatalogharm_gwascatalog.nf | 2 +- 5 files changed, 27 insertions(+), 13 deletions(-) diff --git a/modules/local/failed_copy.nf b/modules/local/failed_copy.nf index 63c79f83..e7580c45 100644 --- a/modules/local/failed_copy.nf +++ b/modules/local/failed_copy.nf @@ -7,7 +7,7 @@ process failed_copy { "${task.ext.docker}${task.ext.docker_version}" }" input: - tuple val(GCST), path(raw_yaml), path(tsv), path(qc_tsv), path (log), path (yaml), val(status) + tuple val(GCST), path(raw_yaml), path(tsv), path(htsv), path(tbi), path(running_log), path(yaml), val(status) output: tuple val(GCST),val(status), env(copy), emit: done @@ -17,12 +17,21 @@ process failed_copy { shell: """ + if [[ $GCST =~ ^GCST[0-9]+$ ]]; then + folder=\$(accession_id.sh -n $GCST) + path=${params.ftp}/\$folder/$GCST/harmonised/ + else + path=${params.ftp}/$GCST + fi - log_file=${launchDir}/$GCST/final/${GCST}.running.log + if [ ! -d \$path ] + then + mkdir -p \$path + fi - if [[ -f \$log_file ]] + if [[ -f $running_log ]] then - cp ${launchDir}/$GCST/final/${GCST}.running.log ${params.failed}/ + cp $running_log ${params.ftp}/ fi copy="copied" diff --git a/modules/local/ftp_copy.nf b/modules/local/ftp_copy.nf index f4c028ae..eaeae59a 100644 --- a/modules/local/ftp_copy.nf +++ b/modules/local/ftp_copy.nf @@ -6,8 +6,8 @@ process ftp_copy{ input: - tuple val(GCST), path(raw_yaml), path(tsv), path(qc_tsv), path(running_log), path(yaml), val(status) - + tuple val(GCST), path(raw_yaml), path(tsv), path(htsv), path(tbi), path(running_log), path(yaml), val(status) + //[GCST1, yaml,tsv,h.tsv.gz,h.tsv.gz.tbi, running.log,h.tsv.gz-meta.yaml, SUCCESS_HARMONIZATION] output: tuple val(GCST), val(status), env(copy), emit: done @@ -16,21 +16,25 @@ process ftp_copy{ shell: """ - folder=\$(accession_id.sh -n $GCST) - path=${params.ftp}/\$folder/$GCST/harmonised/ + if [[ $GCST =~ ^GCST[0-9]+$ ]]; then + folder=\$(accession_id.sh -n $GCST) + path=${params.ftp}/\$folder/$GCST/harmonised/ + else + path=${params.ftp}/$GCST + fi if [ ! -d \$path ] then mkdir -p \$path fi - cp ${launchDir}/$GCST/final/${GCST}.h.tsv.gz \$path + cp $htsv \$path md5_h_tsv=\$(md5sum<${launchDir}/$GCST/final/${GCST}.h.tsv.gz | awk '{print \$1}') md5_h_tsv_copied=\$(md5sum<\$path/${GCST}.h.tsv.gz | awk '{print \$1}') md5_h_tbi=\$(md5sum<${launchDir}/$GCST/final/${GCST}.h.tsv.gz.tbi | awk '{print \$1}') - cp ${launchDir}/$GCST/final/${GCST}.h.tsv.gz.tbi \$path + cp $tbi \$path cp $running_log \$path cp $yaml \$path diff --git a/subworkflows/local/move_files.nf b/subworkflows/local/move_files.nf index 7cae28cc..892c3645 100644 --- a/subworkflows/local/move_files.nf +++ b/subworkflows/local/move_files.nf @@ -10,8 +10,8 @@ workflow move_files{ har_result_ch=input.branch{success:it.contains("SUCCESS_HARMONIZATION") failed:it.contains("FAILED_HARMONIZATION")} //success_harmonized_file move to FTP - //failed harmonized file move to Failed folder - //har_result_ch.success.view(): [GCST1, yaml,tsv,qc.tsv,running.log,h.tsv.gz-meta.yaml, SUCCESS_HARMONIZATION] + //failed harmonized file only log moved to FTP + //har_result_ch.success.view(): [GCST1, yaml,tsv,h.tsv.gz,h.tsv.gz.tbi, running.log,h.tsv.gz-meta.yaml, SUCCESS_HARMONIZATION] ftp_copy(har_result_ch.success) failed_copy(har_result_ch.failed) //return channlel: [GCST,SUCCESS_HARMONIZATION,copied],[GCST,SUCCESS_HARMONIZATION,failed_copied],[GCST,SUCCESS_HARMONIZATION,failed_copy] diff --git a/subworkflows/local/quality_control.nf b/subworkflows/local/quality_control.nf index 2d52b212..dd73d447 100644 --- a/subworkflows/local/quality_control.nf +++ b/subworkflows/local/quality_control.nf @@ -24,4 +24,5 @@ workflow quality_control{ emit: qclog=update_meta_yaml.out.running_result + // qc.log: val(GCST),path(zip_harm),path(zip_harm_tbi),path(running_log),path ("${GCST}.h.tsv.gz-meta.yaml"),env(result) } diff --git a/workflows/gwascatalogharm_gwascatalog.nf b/workflows/gwascatalogharm_gwascatalog.nf index 3cf388c8..e3eea2fc 100644 --- a/workflows/gwascatalogharm_gwascatalog.nf +++ b/workflows/gwascatalogharm_gwascatalog.nf @@ -66,7 +66,7 @@ workflow GWASCATALOGHARM_GWASCATALOG { quality_control(main_harm.out.hm,major_direction.out.direction_sum,ch_files,ch_direction,major_direction.out.unmapped) harmonnized_ch=quality_control.out.qclog all_files_ch=ch_files.join(harmonnized_ch,remainder: true) - //example:[GCST90029037, 37, path *.tsv, path qc.tsv, path GCST90029037.running.log, SUCCESS_HARMONIZATION] + //example:[GCST90029037,raw_yaml, path input.tsv, path h.tsv.gz, path h.tsv.gz.tbi path GCST90029037.running.log, path h.tsv.gz-meta.yaml, SUCCESS_HARMONIZATION] move_files(all_files_ch) //[GCST009150, SUCCESS_HARMONIZATION, copied] From 7af2c92c5734ab390dee34c74ad19eca914e79c5 Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Thu, 31 Oct 2024 14:44:49 +0000 Subject: [PATCH 15/24] update parameters 1. set default the version parameter 2. two sets to recognise the input file name --- config/default_params.config | 1 + config/gwascatalog.config | 11 ----------- config/test.config | 1 + modules/local/harmonization.nf | 2 +- subworkflows/local/prepare_reference.nf | 4 ++-- workflows/gwascatalogharm.nf | 12 +++++++++++- 6 files changed, 16 insertions(+), 15 deletions(-) diff --git a/config/default_params.config b/config/default_params.config index 6af1d60f..ccfa7953 100644 --- a/config/default_params.config +++ b/config/default_params.config @@ -2,4 +2,5 @@ params { to_build='38' chrom=['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','X','Y','MT'] threshold='0.99' + version='v1.1.10' } diff --git a/config/gwascatalog.config b/config/gwascatalog.config index b0faa886..ef0e341a 100644 --- a/config/gwascatalog.config +++ b/config/gwascatalog.config @@ -1,14 +1,3 @@ -params { - to_build='38' - threshold='0.99' - chrom=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y', 'MT'] - - ref='' - all_harm_folder='' - ftp='' - failed='' - } - process { errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'ignore' } maxRetries = 5 diff --git a/config/test.config b/config/test.config index 63bb47a5..d53f652f 100644 --- a/config/test.config +++ b/config/test.config @@ -4,6 +4,7 @@ params { ref=System.getenv('NXF_ASSETS') ? "$NXF_ASSETS/EBISPOT/gwas-sumstats-harmoniser/test_data" : "$HOME/.nextflow/assets/EBISPOT/gwas-sumstats-harmoniser/test_data" threshold='0.5' harm = true + version = 'v1.1.10' file= System.getenv('NXF_ASSETS') ? "$NXF_ASSETS/EBISPOT/gwas-sumstats-harmoniser/test_data/GCST1.tsv" : "$HOME/.nextflow/assets/EBISPOT/gwas-sumstats-harmoniser/test_data/random_name.tsv" } diff --git a/modules/local/harmonization.nf b/modules/local/harmonization.nf index 3f4d83d9..b953eaf5 100644 --- a/modules/local/harmonization.nf +++ b/modules/local/harmonization.nf @@ -41,6 +41,6 @@ process harmonization { pos=\$(awk -v RS='\t' '/base_pair_location/{print NR; exit}' ${chrom}.merged_unsorted.hm) head -n1 ${chrom}.merged_unsorted.hm > ${chrom}.merged.hm; - tail -n+2 ${chrom}.merged_unsorted.hm | sort -n -k\$chr -k\$pos >> ${chrom}.merged.hm + tail -n+2 ${chrom}.merged_unsorted.hm | sort -n -k\$chr -k\$pos -T\$PWD >> ${chrom}.merged.hm """ } diff --git a/subworkflows/local/prepare_reference.nf b/subworkflows/local/prepare_reference.nf index c02bacf5..6e1fca02 100644 --- a/subworkflows/local/prepare_reference.nf +++ b/subworkflows/local/prepare_reference.nf @@ -11,6 +11,6 @@ workflow prepare_reference { // output of make_parquet_refs tuple: [chr, vcf, tbi, parquet] - get_variation_tables(params.remote_ensembl_variation) - make_local_synonyms_table(get_variation_tables.out.var,get_variation_tables.out.syn) + // get_variation_tables(params.remote_ensembl_variation) + // make_local_synonyms_table(get_variation_tables.out.var,get_variation_tables.out.syn) } \ No newline at end of file diff --git a/workflows/gwascatalogharm.nf b/workflows/gwascatalogharm.nf index ebc9bc02..2ff29046 100644 --- a/workflows/gwascatalogharm.nf +++ b/workflows/gwascatalogharm.nf @@ -78,7 +78,17 @@ workflow GWASCATALOGHARM { def input_files(Path input) { - return [input.getName().split('\\.')[0],input+"-meta.yaml",input] + def baseName = input.getName().split("\\.")[0] + + // Check if input name matches the pattern GCST[0-9]+ + if (baseName ==~ /GCST\d+/) { + // Extract GCST ID using regex find + def gcstId = (baseName =~ /GCST\d+/).findAll()[0] + return [gcstId, input + "-meta.yaml", input] + } else { + // Default case + return [baseName, input + "-meta.yaml", input] + } } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 1ee832612a0e0b264d15c6952667e82987df17b9 Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Thu, 31 Oct 2024 17:29:53 +0000 Subject: [PATCH 16/24] count folder changed --- config/gwascatalog.config | 5 +---- modules/local/failed_copy.nf | 2 +- modules/local/ftp_copy.nf | 2 +- modules/local/summarise_strand_counts.nf | 2 +- modules/local/ten_percent_counts_sum.nf | 2 +- 5 files changed, 5 insertions(+), 8 deletions(-) diff --git a/config/gwascatalog.config b/config/gwascatalog.config index ef0e341a..eb496015 100644 --- a/config/gwascatalog.config +++ b/config/gwascatalog.config @@ -1,8 +1,5 @@ process { - errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'ignore' } - maxRetries = 5 - maxErrors = '-1' - + withName:failed_copy { memory = { 5.GB * task.attempt } time = { 1.h * task.attempt } diff --git a/modules/local/failed_copy.nf b/modules/local/failed_copy.nf index e7580c45..dd7667bd 100644 --- a/modules/local/failed_copy.nf +++ b/modules/local/failed_copy.nf @@ -17,7 +17,7 @@ process failed_copy { shell: """ - if [[ $GCST =~ ^GCST[0-9]+$ ]]; then + if [[ $GCST =~ ^GCST[0-9]+ ]]; then folder=\$(accession_id.sh -n $GCST) path=${params.ftp}/\$folder/$GCST/harmonised/ else diff --git a/modules/local/ftp_copy.nf b/modules/local/ftp_copy.nf index eaeae59a..cb408a53 100644 --- a/modules/local/ftp_copy.nf +++ b/modules/local/ftp_copy.nf @@ -16,7 +16,7 @@ process ftp_copy{ shell: """ - if [[ $GCST =~ ^GCST[0-9]+$ ]]; then + if [[ $GCST =~ ^GCST[0-9]+ ]]; then folder=\$(accession_id.sh -n $GCST) path=${params.ftp}/\$folder/$GCST/harmonised/ else diff --git a/modules/local/summarise_strand_counts.nf b/modules/local/summarise_strand_counts.nf index 7902bdef..732c5dcb 100644 --- a/modules/local/summarise_strand_counts.nf +++ b/modules/local/summarise_strand_counts.nf @@ -20,7 +20,7 @@ process summarise_strand_counts { shell: """ sum_strand_counts_nf.py \ - -i ${launchDir}/$GCST/all_sc \ + -i ${launchDir}/$GCST/3_all_sc \ -o total_strand_count.tsv \ -t ${params.threshold} diff --git a/modules/local/ten_percent_counts_sum.nf b/modules/local/ten_percent_counts_sum.nf index 5b6fc029..dc66e70e 100644 --- a/modules/local/ten_percent_counts_sum.nf +++ b/modules/local/ten_percent_counts_sum.nf @@ -17,7 +17,7 @@ process ten_percent_counts_sum { shell: """ sum_strand_counts_10percent_nf.py \ - -i ${launchDir}/$GCST/ten_sc \ + -i ${launchDir}/$GCST/2_ten_sc \ -o ten_percent_total_strand_count.tsv \ -t ${params.threshold} From e14d3b897a51ac012f0ea87d295f9bc7c5003c28 Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Fri, 1 Nov 2024 14:16:34 +0000 Subject: [PATCH 17/24] improve log and test file 1. improve the log 2. change zscore to z_score 3. improve the test file 4. delete no more need --- bin/common_constants.py | 2 +- bin/creat_log.py | 4 +- bin/fill_NA_for_hm_code15_AMPspecific.py | 90 ------------------------ bin/log_script.sh | 66 +++++++++-------- bin/main_pysam.py | 4 +- config/basic.config | 5 ++ main.nf | 35 ++++----- modules/local/get_vcf_files.nf | 2 +- nextflow.config | 6 +- subworkflows/local/prepare_reference.nf | 7 +- test_data/GCST1.tsv | 10 +-- test_data/random_name.tsv | 10 +-- 12 files changed, 75 insertions(+), 166 deletions(-) delete mode 100755 bin/fill_NA_for_hm_code15_AMPspecific.py diff --git a/bin/common_constants.py b/bin/common_constants.py index ea9fbbd4..b3f94fc4 100755 --- a/bin/common_constants.py +++ b/bin/common_constants.py @@ -9,7 +9,7 @@ RANGE_U_DSET = 'ci_upper' RANGE_L_DSET = 'ci_lower' BETA_DSET = 'beta' -ZSCORE_DSET = 'zscore' +ZSCORE_DSET = 'z_score' RSID = 'rsid' SE_DSET = 'standard_error' EFFECT_DSET = 'effect_allele' diff --git a/bin/creat_log.py b/bin/creat_log.py index 94c53d6e..3d90fc4e 100755 --- a/bin/creat_log.py +++ b/bin/creat_log.py @@ -48,7 +48,7 @@ count=success.iloc[i,0] per=success.iloc[i,1] print(key,count,"{0:.2%}".format(per),code_table[key],sep="\t") -print("\n################################################################\n\n") +print("\n################################################################\n") # Failed harmonized variants print("\n6. Failed harmonisation\n") @@ -73,7 +73,7 @@ for key, count in hm_code_fail_dict.items(): per = count/all print(key,count,"{0:.2%}".format(per),code_table[key],sep="\t") -print("\n################################################################\n\n") +print("\n################################################################\n") print("\n7. Overview\n") diff --git a/bin/fill_NA_for_hm_code15_AMPspecific.py b/bin/fill_NA_for_hm_code15_AMPspecific.py deleted file mode 100755 index bab667ab..00000000 --- a/bin/fill_NA_for_hm_code15_AMPspecific.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# In[97]: - - -import pandas as pd -import os -import glob -import argparse -import numpy as np -from copy import deepcopy -from subprocess import Popen, PIPE -from collections import OrderedDict, Counter - - -# In[147]: - - -def complement(s): - basecomplement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'} - letters = list(s) - letters = [basecomplement[base] for base in letters] - return ''.join(letters) -def revcom(s): - return complement(s[::-1]) - - -# In[ ]: - - -def fill_hm_15(ss,palin_mode,outfile): - ssdf = pd.read_csv(ss, sep='\t', dtype=str) - print("read done") - # function 1: test the hm_code and whether this site is an indel (length) - # ssdf["hm_code"]=="15" - # ssdf["effect_allele"].str.len()+ssdf["other_allele"].str.len()>=3 - - # function 2: fill required column with the raw data (direction) - ssdf["hm_rsid"]=np.where((ssdf["hm_code"]=="15")&(ssdf["effect_allele"].str.len()+ssdf["other_allele"].str.len()>=3),ssdf["variant_id"],ssdf["hm_rsid"]) - print("rsid done") - ssdf["hm_chrom"]=np.where((ssdf["hm_code"]=="15")&(ssdf["effect_allele"].str.len()+ssdf["other_allele"].str.len()>=3),ssdf["chromosome"],ssdf["hm_chrom"]) - print("chrom done") - ssdf["hm_pos"]=np.where((ssdf["hm_code"]=="15")&(ssdf["effect_allele"].str.len()+ssdf["other_allele"].str.len()>=3),ssdf["base_pair_location"],ssdf["hm_pos"]) - print("pos done") - ssdf["hm_beta"]=np.where((ssdf["hm_code"]=="15")&(ssdf["effect_allele"].str.len()+ssdf["other_allele"].str.len()>=3),ssdf["beta"],ssdf["hm_beta"]) - print("beta done") - ssdf["hm_odds_ratio"]=np.where((ssdf["hm_code"]=="15")&(ssdf["effect_allele"].str.len()+ssdf["other_allele"].str.len()>=3),ssdf["odds_ratio"],ssdf["hm_odds_ratio"]) - print("or done") - - if palin_mode == 'forward': - ssdf["hm_other_allele"]=np.where((ssdf["hm_code"]=="15")&(ssdf["effect_allele"].str.len()+ssdf["other_allele"].str.len()>=3),ssdf["other_allele"],ssdf["hm_other_allele"]) - ssdf["hm_effect_allele"]=np.where((ssdf["hm_code"]=="15")&(ssdf["effect_allele"].str.len()+ssdf["other_allele"].str.len()>=3),ssdf["effect_allele"],ssdf["hm_effect_allele"]) - ssdf["hm_code"]=np.where((ssdf["hm_code"]=="15")&(ssdf["effect_allele"].str.len()+ssdf["other_allele"].str.len()>=3),16,ssdf["hm_code"]) - # Add new code hm_code=16, which changed from 15, means alleles in the data do not match reference alleles, but they are indels, so keep all raw data ifnormation here. - elif palin_mode == 'reverse': - print("reverse_mode") - ssdf["hm_other_allele"]=np.where((ssdf["hm_code"]=="15")&(ssdf["effect_allele"].str.len()+ssdf["other_allele"].str.len()>=3),ssdf['other_allele'].apply(revcom),ssdf["hm_other_allele"]) - print("re-other done") - ssdf["hm_effect_allele"]=np.where((ssdf["hm_code"]=="15")&(ssdf["effect_allele"].str.len()+ssdf["other_allele"].str.len()>=3),ssdf['effect_allele'].apply(revcom),ssdf["hm_effect_allele"]) - print("re-effect done") - ssdf["hm_code"]=np.where((ssdf["hm_code"]=="15")&(ssdf["effect_allele"].str.len()+ssdf["other_allele"].str.len()>=3),17,ssdf["hm_code"]) - print("code done") - # Add new code hm_code=17, which changed from 15, means alleles in the data do not match reference alleles, but they are indels, so keep revcomp allele of raw data at here. - - ssdf.to_csv(outfile, sep="\t", index=False, na_rep="NA") - - - - -# In[ ]: - - -def main(): - argparser = argparse.ArgumentParser() - argparser.add_argument('-f', help='The name of the file to be processed', required=True) - argparser.add_argument('-p', help='The major direction of the data', required=True) - argparser.add_argument('-o', help='output name', required=True) - args = argparser.parse_args() - - ss = args.f - palin_mode = args.p - outfile=args.o - - fill_hm_15(ss,palin_mode,outfile) - - -if __name__ == "__main__": - main() - diff --git a/bin/log_script.sh b/bin/log_script.sh index adf29db1..78f8ce01 100755 --- a/bin/log_script.sh +++ b/bin/log_script.sh @@ -41,59 +41,57 @@ fi # Begin script in case all parameters are correct -# PIPELINE META +# PIPELINE META, REFERENCE, MAPPING -echo -e "################################################################\n -HARMONISATION RUNNING REPORT\n -################################################################\n\n -" > $output +UNMAPPED_SITES=$(tail -n+2 $unmapped | wc -l) +MAPPED_SITES=$(tail -n+2 $harmonized | wc -l) +TOTAL_SITES=$(($UNMAPPED_SITES + $MAPPED_SITES)) +UNMAPPED_RATE=$(awk "BEGIN {print $UNMAPPED_SITES/$TOTAL_SITES*100}") +MAPPED_RATE=$(awk "BEGIN {print $MAPPED_SITES/$TOTAL_SITES*100}") -echo -e " +printf "################################################################\n +HARMONISATION RUNNING REPORT\n +################################################################\n 1. Pipeline details\n A. Pipeline Version: $version\n B. Running date: $(date | awk '{print $2,$3,$6}')\n C. Input file: $(basename $input)\n ################################################################\n\n -" >> $output - -# REFERENCE - -echo -e " 2. Reference data\n $(tabix -H $reference | grep source)\n $(tabix -H $reference | grep reference)\n $(tabix -H $reference | grep dbSNP | sed 's/INFO=//g')\n -################################################################\n\n -" >> $output +################################################################\n\n" > $output -# MAPPING - -UNMAPPED_SITES=$(tail -n+2 $unmapped | wc -l) -MAPPED_SITES=$(tail -n+2 $harmonized | wc -l) -TOTAL_SITES=$(($UNMAPPED_SITES + $MAPPED_SITES)) - -#TODO: add the number of rs vs liftover -echo -e ' -3. Mapping result\n\n'$(awk "BEGIN {print $UNMAPPED_SITES/$TOTAL_SITES*100}")'% ('$UNMAPPED_SITES' sites out of '$TOTAL_SITES') were dropped because they could not be mapped. \n'$(awk "BEGIN {print $MAPPED_SITES/$TOTAL_SITES*100}")'% ('$MAPPED_SITES' sites) were carried forward.\n - -################################################################\n\n -' >> $output +printf "3. Mapping result\n\n%.2f%% (%d sites out of %d) were dropped because they could not be mapped.\n%.2f%% (%d sites) were carried forward.\n" \ + "$UNMAPPED_RATE" "$UNMAPPED_SITES" "$TOTAL_SITES" \ + "$MAPPED_RATE" "$MAPPED_SITES" >> $output # PALIN MODE palin_mode=$(grep palin_mode $count | cut -f2); -echo -e '4. Palindromic SNPs\n\npalin_mode: '$palin_mode'\n' >> $output +printf "\n################################################################\n +4. Palindromic SNPs\n +palin_mode: $palin_mode\n" >> $output + ratio=$(grep ratio $count); number=$(echo -e $ratio | awk '{print $2}') + if [ $palin_mode = "drop" ]; then -if [ ! $number ]; then echo -e 'Palindromic SNPs could not be harmonized because the direction of palindromic SNPs cannot be inferred from consensus direction.\n'>> $output; -else echo -e 'Palindromic SNPs could not be harmonized because the direction of palindromic SNPs cannot be inferred from consensus direction (forward sites ratio ='$number').\n'>> $output; -fi -elif [[ $ratio =~ "Full" ]]; then -echo -e 'Direction of palindromic SNPs inferred as '$palin_mode' by establishing consensus direction of all sites (forward sites ratio ='$number').\n'>> $output; -elif [[ $ratio =~ "10_percent" ]]; then -echo -e 'Direction of palindromic SNPs inferred as '$palin_mode' by establishing consensus direction of 10% of all sites (forward sites ratio ='$number').\n'>> $output; + if [ ! $number ]; then + printf 'Palindromic SNPs could not be harmonized because the direction of palindromic SNPs cannot be inferred from consensus direction.\n'>> $output; + else + printf 'Palindromic SNPs could not be harmonized because the direction of palindromic SNPs cannot be inferred from consensus direction (forward sites ratio ='$number').\n'>> $output; + fi +else + if [[ $ratio =~ "Full" ]]; then + printf "Direction of palindromic SNPs inferred as %s by establishing consensus direction of all sites (forward sites ratio = %.4f).\n" "$palin_mode" "$number" >> "$output" + elif [[ $ratio =~ "10_percent" ]]; then + printf "Direction of palindromic SNPs inferred as %s by establishing consensus direction of 10% of all sites (forward sites ratio = %.4f).\n" "$palin_mode" "$number" >> "$output" + else + printf "Direction of palindromic SNPs inferred as %s by establishing consensus direction (forward sites ratio = %.4f).\n" "$palin_mode" "$number" >> "$output" + fi fi -echo -e '################################################################\n\n' >> $output +printf "\n################################################################\n\n" >> $output diff --git a/bin/main_pysam.py b/bin/main_pysam.py index 8ae31496..7d1632a4 100755 --- a/bin/main_pysam.py +++ b/bin/main_pysam.py @@ -156,7 +156,7 @@ def main(): out_raw["other_allele"] = ss_rec.hm_other_al.str() if vcf_rec and ss_rec.is_harmonised else args.na_rep_out out_raw["beta"] = ss_rec.beta if ss_rec.beta is not None and ss_rec.is_harmonised else args.na_rep_out out_raw["odds_ratio"] = ss_rec.oddsr if ss_rec.oddsr is not None and ss_rec.is_harmonised else args.na_rep_out - out_raw["zscore"] = ss_rec.zscore if ss_rec.zscore is not None and ss_rec.is_harmonised else args.na_rep_out + out_raw["z_score"] = ss_rec.zscore if ss_rec.zscore is not None and ss_rec.is_harmonised else args.na_rep_out out_raw["ci_lower"] = ss_rec.oddsr_lower if ss_rec.oddsr_lower is not None and ss_rec.is_harmonised else args.na_rep_out out_raw["ci_upper"] = ss_rec.oddsr_upper if ss_rec.oddsr_upper is not None and ss_rec.is_harmonised else args.na_rep_out out_raw["effect_allele_frequency"] = ss_rec.eaf if ss_rec.eaf is not None and ss_rec.is_harmonised else args.na_rep_out @@ -174,7 +174,7 @@ def main(): except: out_raw["standard_error"]=args.na_rep_out # Add other data from summary stat file - outed=["chromosome","base_pair_location","p_value","effect_allele","other_allele","effect_allele_frequency","beta","odds_ratio","rsid","standard_error","ci_upper","ci_lower","hm_coordinate_conversion","zscore"] + outed=["chromosome","base_pair_location","p_value","effect_allele","other_allele","effect_allele_frequency","beta","odds_ratio","rsid","standard_error","ci_upper","ci_lower","hm_coordinate_conversion","z_score"] for key in ss_rec.data: if key not in outed: value = ss_rec.data[key] if ss_rec.data[key] else args.na_rep_out diff --git a/config/basic.config b/config/basic.config index 0b674d00..2b050075 100644 --- a/config/basic.config +++ b/config/basic.config @@ -4,6 +4,11 @@ process { memory = { 1.GB * task.attempt } time = { 1.h * task.attempt } + withName:get_vcf_files { + memory = { 20.GB * task.attempt } + time = { 5.h * task.attempt } + } + withName:map_to_build { memory = { 28.GB * task.attempt } time = { 5.h * task.attempt } diff --git a/main.nf b/main.nf index 9e8292f3..770e1938 100644 --- a/main.nf +++ b/main.nf @@ -35,29 +35,30 @@ workflow NFCORE_GWASCATALOGHARM { params.gwascatalog = null params.harm = null - if (!params.to_build) { - println "ERROR: You didn't set the target build to harmonise to" - println "Please set --to_build 38" - System.exit(1) - } - if (!params.chrom) { println "ERROR: You didn't set chromsomes to be harmnnised" println "Please set --chrom 22 or --chromlist 22,X,Y or set chrom in ./config/default_params.config " System.exit(1) } - if (!params.threshold) { - println "ERROR: You didn't set threshold to imput the direction of palindromic variants" - println "Please set --threshold 0.99 or set threshold in ./config/default_params.config " - System.exit(1) - } - - - if (!params.version) { - println " ERROR: Please specific the pipeline version you are running (e.g. v1.1.9) \ - Please set --version and try again (: " - System.exit(1) + if (!params.reference) { + if (!params.to_build) { + println "ERROR: You didn't set the target build to harmonise to" + println "Please set --to_build 38" + System.exit(1) + } + + if (!params.threshold) { + println "ERROR: You didn't set threshold to imput the direction of palindromic variants" + println "Please set --threshold 0.99 or set threshold in ./config/default_params.config " + System.exit(1) + } + + if (!params.version) { + println " ERROR: Please specific the pipeline version you are running (e.g. v1.1.9) \ + Please set --version and try again (: " + System.exit(1) + } } // check conditinal input parameters diff --git a/modules/local/get_vcf_files.nf b/modules/local/get_vcf_files.nf index e133dba0..d4a6da81 100644 --- a/modules/local/get_vcf_files.nf +++ b/modules/local/get_vcf_files.nf @@ -1,5 +1,6 @@ /* download reference */ process get_vcf_files { + tag "${chr}" conda (params.enable_conda ? "${task.ext.conda}" : null) container "${ workflow.containerEngine == 'singularity' && @@ -8,7 +9,6 @@ process get_vcf_files { "${task.ext.docker}${task.ext.docker_version}" }" storeDir params.ref - errorStrategy = { 'ignore' } input: val chr diff --git a/nextflow.config b/nextflow.config index b0a0f1d9..d57287e1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -93,9 +93,9 @@ profiles { includeConfig './config/executor.config' if (params.terminate_error) { includeConfig './config/exit_error.config' - } else { - includeConfig './config/ignore_error.config' - } + } else{ + includeConfig './config/ignore_error.config' + } } conda { params.enable_conda = true diff --git a/subworkflows/local/prepare_reference.nf b/subworkflows/local/prepare_reference.nf index 6e1fca02..d2cc9e0a 100644 --- a/subworkflows/local/prepare_reference.nf +++ b/subworkflows/local/prepare_reference.nf @@ -1,16 +1,11 @@ // if reference files are not exist, download and prepare reference include {get_vcf_files} from '../../modules/local/get_vcf_files' -include {get_variation_tables} from '../../modules/local/get_variation_tables' -include {make_local_synonyms_table} from '../../modules/local/make_local_synonyms_table' - workflow prepare_reference { take: in_chrom main: + get_vcf_files(in_chrom) // output of make_parquet_refs tuple: [chr, vcf, tbi, parquet] - - // get_variation_tables(params.remote_ensembl_variation) - // make_local_synonyms_table(get_variation_tables.out.var,get_variation_tables.out.syn) } \ No newline at end of file diff --git a/test_data/GCST1.tsv b/test_data/GCST1.tsv index 44c9baa4..3a1a4bdc 100644 --- a/test_data/GCST1.tsv +++ b/test_data/GCST1.tsv @@ -1,5 +1,5 @@ -chromosome base_pair_location effect_allele other_allele beta standard_error effect_allele_frequency p_value info rsid -1 693731 A G -0.016619 0.00806496 0.997221 0.1 ref_rs12238997 NA -1 935393 G GCCACGGG -0.016619 0.00806496 0.997221 0.1 ref_rs1469404497 NA -1 935475 CGC C -0.016619 0.00806496 0.997221 0.1 ref_rs1014128468 rs1014128468 -22 16052962 T C -0.00477642 0.0164749 0.089851 0.77 ref_rs376238049 NA \ No newline at end of file +chromosome base_pair_location effect_allele other_allele beta standard_error effect_allele_frequency p_value info rsid z_score odds_ratio ci_lower ci_upper +1 693731 G A 0.1 0.2 0.3 0.4 ref_rs12238997_forward_noflip NA 0.5 0.7 0.6 0.8 +1 1 CCCGTGGC C 0.1 0.2 0.3 0.4 ref_rs1469404497_reverse_noflip rs1469404497 0.5 0.7 0.6 0.8 +1 935475 CGC C -0.1 0.2 0.7 0.4 ref_rs1014128468_forward_flipped NA -0.5 1.4 1.25 1.67 +22 16052962 C T -0.1 0.2 0.7 0.4 ref_rs376238049_reverse_flipped rs376238049 -0.5 1.4 1.25 1.67 \ No newline at end of file diff --git a/test_data/random_name.tsv b/test_data/random_name.tsv index e5b80a0a..b1696323 100644 --- a/test_data/random_name.tsv +++ b/test_data/random_name.tsv @@ -1,5 +1,5 @@ -chromosome base_pair_location effect_allele other_allele beta standard_error effect_allele_frequency p_value info rsid zscore odds_ratio -1 693731 C T 0.01 0.00806496 0.002779 0.1 ref_rs12238997_A_G NA 0.02 0.03 -1 935393 G GCCACGGG -0.01 0.00806496 0.997221 0.1 ref_rs1469404497_G_GCCACGGG_norsid_flipped NA -0.02 -0.03 -1 935475 GCG G -0.01 0.00806496 0.997221 0.1 ref_rs1014128468_CGC_C_norsid_flipped NA -0.02 -0.03 -22 16052962 T C -0.00477642 0.0164749 0.089851 0.77 ref_rs376238049_G_A NA 0.02 0.03 \ No newline at end of file +chromosome base_pair_location effect_allele other_allele beta standard_error effect_allele_frequency neg_log_10_p_value info rsid zscore odds_ratio ci_lower ci_upper +1 693731 G A 0.1 0.2 0.3 0.4 ref_rs12238997_forward_noflip rs61769350 0.5 0.7 0.6 0.8 +1 1 CCCGTGGC C 0.1 0.2 0.3 0.4 ref_rs1469404497_reverse_noflip rs1469404497 0.5 0.7 0.6 0.8 +1 935475 CGC C -0.1 0.2 0.7 0.4 ref_rs1014128468_forward_flipped NA -0.5 1.4 1.25 1.67 +22 16052962 C T -0.1 0.2 0.7 0.4 ref_rs376238049_reverse_flipped rs376238049 -0.5 1.4 1.25 1.67 \ No newline at end of file From 0c84d17d4ebb16303cf39a2ab14a0af5c50fceb8 Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Fri, 1 Nov 2024 14:51:40 +0000 Subject: [PATCH 18/24] avoid repeatedly vcf download If the job restarted with more memory, redownload the vcf file is unnecessary. --- modules/local/get_vcf_files.nf | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/modules/local/get_vcf_files.nf b/modules/local/get_vcf_files.nf index d4a6da81..81045237 100644 --- a/modules/local/get_vcf_files.nf +++ b/modules/local/get_vcf_files.nf @@ -19,9 +19,18 @@ process get_vcf_files { shell: """ - mkdir -p $params.ref - wget -P $params.ref ${params.remote_vcf_location}/homo_sapiens-${chr}.vcf.gz - tabix -f -p vcf ${params.ref}/homo_sapiens-${chr}.vcf.gz + # Check if the directory exists; if not, create it + [[ -d $params.ref ]] || mkdir -p $params.ref + + # Check if the VCF file already exists; if not, download it + if [[ ! -f $params.ref/homo_sapiens-${chr}.vcf.gz ]]; then + wget -P $params.ref ${params.remote_vcf_location}/homo_sapiens-${chr}.vcf.gz + fi + + # Check if the index file exists; if not, create it + if [[ ! -f $params.ref/homo_sapiens-${chr}.vcf.gz.tbi ]]; then + tabix -f -p vcf $params.ref/homo_sapiens-${chr}.vcf.gz + fi vcf2parquet_nf.py \ -f ${params.ref}/homo_sapiens-${chr}.vcf.gz \ From a7400c495207a3bc08a879e3cf2ab1aa0c614811 Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Fri, 1 Nov 2024 18:30:55 +0000 Subject: [PATCH 19/24] simplify the test.config simplify the test. config, only add requirements on resources and remove params.ftp --- config/gwascatalog.config | 2 +- config/test.config | 75 +++++++++------------------------------ main.nf | 20 ++++++++--- 3 files changed, 32 insertions(+), 65 deletions(-) diff --git a/config/gwascatalog.config b/config/gwascatalog.config index eb496015..b69fb9cb 100644 --- a/config/gwascatalog.config +++ b/config/gwascatalog.config @@ -4,7 +4,7 @@ process { memory = { 5.GB * task.attempt } time = { 1.h * task.attempt } publishDir =[ - path:{"${params.failed}"}, + path:{"${params.ftp}"}, mode: 'move' ] } diff --git a/config/test.config b/config/test.config index d53f652f..71a0b693 100644 --- a/config/test.config +++ b/config/test.config @@ -9,84 +9,41 @@ params { } process{ + + process.executor = 'local' withName:map_to_build { - memory = { 3.GB * task.attempt } - time = { 3.h * task.attempt } - publishDir =[ - path:{"${launchDir}/$GCST/1_map_to_build"}, - mode: 'copy' - ] + memory = { 3.GB } + time = { 3.h } } withName:ten_percent_counts { - memory = { 3.GB * task.attempt } - time = { 3.h * task.attempt } - publishDir =[ - path:{"${launchDir}/$GCST/2_ten_sc"}, - mode: 'copy' - ] - } - - withName:ten_percent_counts_sum { - publishDir =[ - path:{"${launchDir}/$GCST"}, - mode: 'copy' - ] + memory = { 3.GB } + time = { 3.h } } withName:generate_strand_counts { - memory = { 3.GB * task.attempt } - time = { 3.h * task.attempt } - publishDir =[ - path:{"${launchDir}/$GCST/3_all_sc"}, - mode: 'copy' - ] - } - - withName:summarise_strand_counts { - publishDir =[ - path:{"${launchDir}/$GCST"}, - mode: 'copy' - ] + memory = { 3.GB } + time = { 3.h } } withName:harmonization { - memory = { 3.GB * task.attempt } - time = { 3.h * task.attempt } - publishDir =[ - path:{"${launchDir}/$GCST/4_harmonization"}, - mode: 'copy' - ] - } - - withName:concatenate_chr_splits { - publishDir =[ - path:{"${launchDir}/$GCST/5_qc"}, - mode: 'copy' - ] + memory = { 3.GB } + time = { 3.h } } withName:qc { - memory = { 3.GB * task.attempt } - time = { 1.h * task.attempt } - publishDir =[ - path:{"${launchDir}/$GCST/5_qc"}, - mode: 'copy' - ] + memory = { 3.GB } + time = { 1.h } } withName:harmonization_log { - memory = { 3.GB * task.attempt } - time = { 1.h * task.attempt } + memory = { 3.GB } + time = { 1.h } } withName:update_meta_yaml { - memory = { 2.GB * task.attempt } - time = { 1.h * task.attempt } - publishDir =[ - path:{"${launchDir}/$GCST/final"}, - mode: 'copy' - ] + memory = { 2.GB } + time = { 1.h } } } diff --git a/main.nf b/main.nf index 770e1938..85c21a7c 100644 --- a/main.nf +++ b/main.nf @@ -66,27 +66,37 @@ workflow NFCORE_GWASCATALOGHARM { if (params.reference) { println ("Prepare the reference ...") PREPARE_REFERENCE() - } else if (params.gwascatalog) { + } + else if (params.gwascatalog) { if (!params.to_harm_folder) { println " ERROR: You didn't set any folder to be harmonised \ Please set --to_harm_folder and try again (: " System.exit(1) - } else { + } + else if (!params.ftp) { + println " ERROR: You didn't set any folder to store your final result \ + Please set --ftp and try again (: " + System.exit(1) + } + else { println ("Harmonizing files in the folder ${params.all_harm_folder}") GWASCATALOGHARM_GWASCATALOG() } - } else if (params.harm) { + } + else if (params.harm) { if (!params.file && !params.list) { println " ERROR: You didn't set any files to be harmonised \ Please set --file for a single input file or \ set --list for a list containing all files are waiting to be harmonised \ and try again (: " System.exit(1) - } else { + } + else { println ("Start harmonising files") GWASCATALOGHARM() } - } else { + } + else { println " ERROR: You didn't set any model to run the pipeline \ Please set --harm and try again (: " System.exit(1) From f394d4097938adb5b813f38cba0e2afb98ef32cd Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Fri, 1 Nov 2024 21:44:02 +0000 Subject: [PATCH 20/24] update readme --- README.md | 42 ++++++++++++++++++++---- workflows/gwascatalogharm.nf | 18 +++++----- workflows/gwascatalogharm_gwascatalog.nf | 15 +++++++-- 3 files changed, 59 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 4acdd474..d3e64351 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # gwas-sumstats-harmoniser +👋 Welcome to our first release of the [documentation](https://ebispot.github.io/gwas-sumstats-harmoniser-documentation/) for gwas-sumstat-harmoniser!👋 + GWAS Summary Statistics Data Harmonisation pipeline aims to bring the variants to the desired genome assembly and then harmonises variants to match variants in reference data. The harmonisation process is the following: @@ -32,12 +34,14 @@ This repository is stored in the Nextflow home directory, that is by default the The resource bundle is a collection of standard files for harmonising GWAS summary statistics data. We support the Ensembl variants VCF reference (hg38, dbSNP 151) and synonyms table . These files can be directly downloaded from our [FTP](https://ftp.ebi.ac.uk/pub/databases/gwas/harmonisation_resources/) server. +**OR** + Users can also prepare your own reference: ``` nextflow run EBISPOT/gwas-sumstats-harmoniser \ --reference \ --ref 'full path to store reference' \ --profile cluster,singularity/conda (running on the cluster) or -profile standard,docker/conda (running locally) +-profile executor,singularity/conda (running on the cluster) or -profile standard,docker/conda (running locally) ``` Default reference were originally downloaded from ``` @@ -52,8 +56,25 @@ If you want to only run specific chromsomes, `--chrom 22` or `--chromlist 22,X,Y ### 3.1 General users Step1: Prepare input file: -* Files are correctly formatted using the validator. -* The name must follow the convention _.tsv e.g. my_summary_stats_37.tsv (37 denotes the genome assembly of the data in the file is hg19 or GRCh37) +* Sumstats are correctly formatted using the validator. +* Yaml file containing `genome_assembly` and `coordinate_system` +``` +# Study meta-data +date_metadata_last_modified: 2023-02-09 + +# Genotyping Information +genome_assembly: GRCh37 +coordinate_system: 1-based + +# Summary Statistic information +data_file_name: gwas_sumstat_name.tsv +file_type: GWAS-SSF v0.1 +data_file_md5sum: 32ce41c3dca4cd9f463a0ce7351966fd + +# Harmonization status +is_harmonised: false +is_sorted: false +``` Step2: Run the pipeline. @@ -64,7 +85,7 @@ nextflow run EBISPOT/gwas-sumstats-harmoniser \ --ref 'full path to store reference' \ --harm \ --file Full_path_of_the_file_to_be_harmonised or --list path_of_list.txt \ --profile cluster,singularity/conda or -profile standard,docker/conda +-profile executor,singularity/conda or -profile standard,docker/conda ``` Harmonising a batch of files in list.txt file, which is a txt file that each row is a full path of tsv files to be harmonised. @@ -77,7 +98,7 @@ We constructed a customized pipeline for GWAS catalog daily running. This pipeli nextflow run EBISPOT/gwas-sumstats-harmoniser \ --ref 'full path to store reference' \ --gwascatalog \ --profile cluster,singularity/ +-profile executor,singularity/ ``` ### 3.3. Other options: @@ -106,4 +127,13 @@ Conda environments are stored on the file system. By default Nextflow instructs [Nextflow’s documentation](https://www.nextflow.io/docs/latest/executor.html). # 4. Harmonisation steps: -More information about the harmonisation process refers to [GWAS catalog documents](https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics) +More information about the harmonisation process refers to [GWAS catalog documents](https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics) and [our documentation](https://ebispot.github.io/gwas-sumstats-harmoniser-documentation/) + +# 5. Contact us: +🫶 We'd love to hear from you! + +* To provide feedback or ask a question, contact the GWAS Catalog team on gwas-info@ebi.ac.uk. +* If you believe you’ve encountered a bug, please don’t hesitate to report it in our [GitHub repository](https://github.com/EBISPOT/gwas-sumstats-harmoniser/issues/new?assignees=&labels=bug&projects=&template=bug_report.yml&title=%5BBug%5D%3A+). +* If you encounter any issues while running the pipeline, feel free to join the discussion in the [gwas-sumstats-harmoniser Discussions](https://github.com/EBISPOT/gwas-sumstats-harmoniser/discussions)! + + diff --git a/workflows/gwascatalogharm.nf b/workflows/gwascatalogharm.nf index 2ff29046..8c3cdb8c 100644 --- a/workflows/gwascatalogharm.nf +++ b/workflows/gwascatalogharm.nf @@ -76,18 +76,20 @@ workflow GWASCATALOGHARM { quality_control(main_harm.out.hm,major_direction.out.direction_sum,files,ch_for_direction,major_direction.out.unmapped) } -def input_files(Path input) -{ +def input_files(input) { def baseName = input.getName().split("\\.")[0] - - // Check if input name matches the pattern GCST[0-9]+ - if (baseName ==~ /GCST\d+/) { + + // Check if the base name matches the pattern GCST[0-9]+ + def matcher = (baseName=~ /GCST\d+/).findAll() + if (matcher) { // Extract GCST ID using regex find - def gcstId = (baseName =~ /GCST\d+/).findAll()[0] - return [gcstId, input + "-meta.yaml", input] + println "yes,GCST" + def gcstId = matcher[0] // Get the first match + return [gcstId, input+"-meta.yaml", input] } else { // Default case - return [baseName, input + "-meta.yaml", input] + println "no,other setting" + return [baseName, input+"-meta.yaml", input] } } /* diff --git a/workflows/gwascatalogharm_gwascatalog.nf b/workflows/gwascatalogharm_gwascatalog.nf index e3eea2fc..0df3e037 100644 --- a/workflows/gwascatalogharm_gwascatalog.nf +++ b/workflows/gwascatalogharm_gwascatalog.nf @@ -72,8 +72,19 @@ workflow GWASCATALOGHARM_GWASCATALOG { //[GCST009150, SUCCESS_HARMONIZATION, copied] } -def input_list(Path input) { - return [(input.getName()=~ /GCST\d+/).findAll()[0],input+"-meta.yaml",input] +def input_files(input) { + def baseName = input.getName().split("\\.")[0] + + // Check if the base name matches the pattern GCST[0-9]+ + def matcher = (baseName=~ /GCST\d+/).findAll() + if (matcher) { + // Extract GCST ID using regex find + def gcstId = matcher[0] // Get the first match + return [gcstId, input+"-meta.yaml", input] + } else { + // Default case + return [baseName, input+"-meta.yaml", input] + } } From 759f29408e153392df95f9d5a7449f78c18e651f Mon Sep 17 00:00:00 2001 From: jiyue1214 Date: Sun, 3 Nov 2024 01:23:20 +0000 Subject: [PATCH 21/24] decrease test config memory 1. not output neg_log_10_p_value in the output 2. decrease the memory need for test data to 1G --- bin/main_pysam.py | 7 +++- config/test.config | 45 ++++++++++++++++-------- main.nf | 4 +-- nextflow.config | 6 ++-- workflows/gwascatalogharm_gwascatalog.nf | 2 +- 5 files changed, 44 insertions(+), 20 deletions(-) diff --git a/bin/main_pysam.py b/bin/main_pysam.py index 7d1632a4..7dd6ae93 100755 --- a/bin/main_pysam.py +++ b/bin/main_pysam.py @@ -28,9 +28,14 @@ def main(): header_written = False strand_counter = Counter() code_counter = Counter() + if args.hm_sumstats: out_handle = open_gzip(args.hm_sumstats, "wb") out_header = SumStatsTable(sumstats_file=args.sumstats)._set_header_order() + tag_neg_log_10_p_value=False + if "neg_log_10_p_value" in out_header: + out_header.remove("neg_log_10_p_value") + tag_neg_log_10_p_value=True #######YUE################ tbx=pysam.TabixFile(args.vcf) @@ -161,7 +166,7 @@ def main(): out_raw["ci_upper"] = ss_rec.oddsr_upper if ss_rec.oddsr_upper is not None and ss_rec.is_harmonised else args.na_rep_out out_raw["effect_allele_frequency"] = ss_rec.eaf if ss_rec.eaf is not None and ss_rec.is_harmonised else args.na_rep_out # Process the neg_log_10_p_value - if "neg_log_10_p_value" in out_header: + if tag_neg_log_10_p_value == True: out_raw["p_value"] = 10**(float(ss_rec.data["neg_log_10_p_value"])*(-1)) if ss_rec.data["neg_log_10_p_value"] is not None else args.na_rep_out else: out_raw["p_value"]=ss_rec.data["p_value"] if ss_rec.data["p_value"] is not None else args.na_rep_out diff --git a/config/test.config b/config/test.config index 71a0b693..f7f34715 100644 --- a/config/test.config +++ b/config/test.config @@ -13,37 +13,54 @@ process{ process.executor = 'local' withName:map_to_build { - memory = { 3.GB } - time = { 3.h } + memory = { 1.GB } + publishDir =[ + path:{"${launchDir}/$GCST/1_map_to_build"}, + mode: 'copy' + ] } withName:ten_percent_counts { - memory = { 3.GB } - time = { 3.h } + memory = { 1.GB } + publishDir =[ + path:{"${launchDir}/$GCST/2_ten_sc"}, + mode: 'copy' + ] } withName:generate_strand_counts { - memory = { 3.GB } - time = { 3.h } + memory = { 1.GB } + publishDir =[ + path:{"${launchDir}/$GCST/3_all_sc"}, + mode: 'copy' + ] } withName:harmonization { - memory = { 3.GB } - time = { 3.h } + memory = { 1.GB } + publishDir =[ + path:{"${launchDir}/$GCST/4_harmonization"}, + mode: 'copy' + ] } withName:qc { - memory = { 3.GB } - time = { 1.h } + memory = { 1.GB } + publishDir =[ + path:{"${launchDir}/$GCST/5_qc"}, + mode: 'copy' + ] } withName:harmonization_log { - memory = { 3.GB } - time = { 1.h } + memory = { 1.GB } } withName:update_meta_yaml { - memory = { 2.GB } - time = { 1.h } + memory = { 1.GB } + publishDir =[ + path:{"${launchDir}/$GCST/final"}, + mode: 'copy' + ] } } diff --git a/main.nf b/main.nf index 85c21a7c..0b4c8075 100644 --- a/main.nf +++ b/main.nf @@ -68,9 +68,9 @@ workflow NFCORE_GWASCATALOGHARM { PREPARE_REFERENCE() } else if (params.gwascatalog) { - if (!params.to_harm_folder) { + if (!params.all_harm_folder) { println " ERROR: You didn't set any folder to be harmonised \ - Please set --to_harm_folder and try again (: " + Please set --all_harm_folder and try again (: " System.exit(1) } else if (!params.ftp) { diff --git a/nextflow.config b/nextflow.config index d57287e1..44064eab 100644 --- a/nextflow.config +++ b/nextflow.config @@ -67,7 +67,7 @@ if (!params.harm & !params.gwascatalog) { if (params.harm) { if (!params.reference & !params.gwascatalog) { - includeConfig 'config/default_params.config' + includeConfig './config/default_params.config' includeConfig './config/basic.config' if (params.chromlist){ params.chrom = params.chromlist?.tokenize(',') as List @@ -77,7 +77,9 @@ if (!params.reference & !params.gwascatalog) { if (params.gwascatalog) { if (!params.harm & !params.reference) { - includeConfig 'config/gwascatalog.config' + includeConfig './config/default_params.config' + includeConfig './config/basic.config' + includeConfig './config/gwascatalog.config' } } diff --git a/workflows/gwascatalogharm_gwascatalog.nf b/workflows/gwascatalogharm_gwascatalog.nf index 0df3e037..e1680651 100644 --- a/workflows/gwascatalogharm_gwascatalog.nf +++ b/workflows/gwascatalogharm_gwascatalog.nf @@ -72,7 +72,7 @@ workflow GWASCATALOGHARM_GWASCATALOG { //[GCST009150, SUCCESS_HARMONIZATION, copied] } -def input_files(input) { +def input_list(input) { def baseName = input.getName().split("\\.")[0] // Check if the base name matches the pattern GCST[0-9]+ From 4f82cdf12b808dae3b644c689101bf7c898e3a83 Mon Sep 17 00:00:00 2001 From: jiyue1214 <52689284+jiyue1214@users.noreply.github.com> Date: Sun, 3 Nov 2024 20:14:36 +0000 Subject: [PATCH 22/24] Update random_name.tsv change zscore into z_score in column name --- test_data/random_name.tsv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_data/random_name.tsv b/test_data/random_name.tsv index b1696323..017c196f 100644 --- a/test_data/random_name.tsv +++ b/test_data/random_name.tsv @@ -1,5 +1,5 @@ -chromosome base_pair_location effect_allele other_allele beta standard_error effect_allele_frequency neg_log_10_p_value info rsid zscore odds_ratio ci_lower ci_upper +chromosome base_pair_location effect_allele other_allele beta standard_error effect_allele_frequency neg_log_10_p_value info rsid z_score odds_ratio ci_lower ci_upper 1 693731 G A 0.1 0.2 0.3 0.4 ref_rs12238997_forward_noflip rs61769350 0.5 0.7 0.6 0.8 1 1 CCCGTGGC C 0.1 0.2 0.3 0.4 ref_rs1469404497_reverse_noflip rs1469404497 0.5 0.7 0.6 0.8 1 935475 CGC C -0.1 0.2 0.7 0.4 ref_rs1014128468_forward_flipped NA -0.5 1.4 1.25 1.67 -22 16052962 C T -0.1 0.2 0.7 0.4 ref_rs376238049_reverse_flipped rs376238049 -0.5 1.4 1.25 1.67 \ No newline at end of file +22 16052962 C T -0.1 0.2 0.7 0.4 ref_rs376238049_reverse_flipped rs376238049 -0.5 1.4 1.25 1.67 From 8d9b3a660ca5401082499d2984470ea112828978 Mon Sep 17 00:00:00 2001 From: jiyue1214 <52689284+jiyue1214@users.noreply.github.com> Date: Sun, 3 Nov 2024 20:21:25 +0000 Subject: [PATCH 23/24] Update GCST0.tsv --- test_data/GCST0.tsv | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test_data/GCST0.tsv b/test_data/GCST0.tsv index 9e9c95a6..b9e74481 100644 --- a/test_data/GCST0.tsv +++ b/test_data/GCST0.tsv @@ -1,5 +1,5 @@ -chromosome base_pair_location effect_allele other_allele beta standard_error effect_allele_frequency p_value info rsid -1 693730 A G -0.016619 0.00806496 0.997221 0.1 ref_rs12238997 NA -1 935393 G GCCACGGG -0.016619 0.00806496 0.997221 0.1 ref_rs1469404497 rs1469404497 -1 935474 CGC C -0.016619 0.00806496 0.997221 0.1 ref_rs1014128468 NA -22 16052961 T C -0.00477642 0.0164749 0.089851 0.77 ref_rs376238049 NA \ No newline at end of file +chromosome base_pair_location effect_allele other_allele beta standard_error effect_allele_frequency p_value info rsid z_score odds_ratio ci_lower ci_upper +1 693730 G A 0.1 0.2 0.3 0.4 ref_rs12238997_forward_noflip NA 0.5 0.7 0.6 0.8 +1 935393 CCCGTGGC C 0.1 0.2 0.3 0.4 ref_rs1469404497_reverse_noflip rs1469404497 0.5 0.7 0.6 0.8 +1 935474 CGC C -0.1 0.2 0.7 0.4 ref_rs1014128468_forward_flipped NA -0.5 1.4 1.25 1.67 +22 16052961 C T -0.1 0.2 0.7 0.4 ref_rs376238049_reverse_flipped rs376238049 -0.5 1.4 1.25 1.67 From 828544fa8fc2e614e6a520946ff4ce073e82e79f Mon Sep 17 00:00:00 2001 From: jiyue1214 <52689284+jiyue1214@users.noreply.github.com> Date: Sun, 3 Nov 2024 22:37:50 +0000 Subject: [PATCH 24/24] Update main_pysam.py delete print the z-score value --- bin/main_pysam.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/main_pysam.py b/bin/main_pysam.py index 7dd6ae93..f697a565 100755 --- a/bin/main_pysam.py +++ b/bin/main_pysam.py @@ -152,7 +152,6 @@ def main(): # # Write ssrec to output ------------------------------------------------ # - print("ss_rec.zscore:",ss_rec.zscore) if args.hm_sumstats: out_raw = OrderedDict() out_raw["chromosome"] = ss_rec.hm_chrom if vcf_rec and ss_rec.is_harmonised else args.na_rep_out