
Commit

Merge pull request #112 from EBISPOT/dev
Updates to a new version v1.1.10 (GWAS_SSF)
jiyue1214 authored Nov 4, 2024
2 parents 0d9639f + 828544f commit 436c17a
Showing 39 changed files with 442 additions and 378 deletions.
42 changes: 36 additions & 6 deletions README.md
@@ -1,5 +1,7 @@
# gwas-sumstats-harmoniser

👋 Welcome to the first release of the [documentation](https://ebispot.github.io/gwas-sumstats-harmoniser-documentation/) for gwas-sumstats-harmoniser! 👋

The GWAS Summary Statistics Data Harmonisation pipeline aims to lift variants to the desired genome assembly and then harmonise them to match the variants in the reference data.

The harmonisation process is the following:
@@ -32,12 +34,14 @@ This repository is stored in the Nextflow home directory, which is by default the

The resource bundle is a collection of standard files for harmonising GWAS summary statistics data. We support the Ensembl variants VCF reference (hg38, dbSNP 151) and the accompanying synonyms table. These files can be downloaded directly from our [FTP](https://ftp.ebi.ac.uk/pub/databases/gwas/harmonisation_resources/) server.

**OR**

Users can also prepare their own reference:
```
nextflow run EBISPOT/gwas-sumstats-harmoniser \
--reference \
--ref 'full path to store reference' \
-profile cluster,singularity/conda (running on the cluster) or -profile standard,docker/conda (running locally)
-profile executor,singularity/conda (running on the cluster) or -profile standard,docker/conda (running locally)
```
The default reference files were originally downloaded from
```
@@ -52,8 +56,25 @@ If you want to run only specific chromosomes, use `--chrom 22` or `--chromlist 22,X,Y`
### 3.1 General users

Step 1: Prepare the input file:
* Files are correctly formatted using the validator.
* The file name must follow the convention `<any identifier>_<genome assembly number>.tsv`, e.g. `my_summary_stats_37.tsv` (37 denotes that the data in the file are on genome assembly hg19/GRCh37)
* Sumstats are correctly formatted using the validator.
* A YAML metadata file containing `genome_assembly` and `coordinate_system`, for example:
```
# Study meta-data
date_metadata_last_modified: 2023-02-09
# Genotyping Information
genome_assembly: GRCh37
coordinate_system: 1-based
# Summary Statistic information
data_file_name: gwas_sumstat_name.tsv
file_type: GWAS-SSF v0.1
data_file_md5sum: 32ce41c3dca4cd9f463a0ce7351966fd
# Harmonization status
is_harmonised: false
is_sorted: false
```
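A minimal sketch of checking that the sidecar metadata file carries the two fields the pipeline needs. This assumes the flat `key: value` layout shown above; a production check would use a real YAML parser such as PyYAML, and the helper names here are illustrative, not part of the pipeline.

```python
# Sketch only: parse a flat "key: value" metadata file and verify the
# two fields the harmoniser requires (genome_assembly, coordinate_system).
REQUIRED_KEYS = {"genome_assembly", "coordinate_system"}

def read_flat_yaml(text):
    """Parse flat key: value lines, skipping blanks and # comments."""
    meta = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        key, _, value = line.partition(":")
        meta[key.strip()] = value.strip()
    return meta

def check_metadata(meta):
    """Raise if required keys are missing; return the two values."""
    missing = REQUIRED_KEYS - meta.keys()
    if missing:
        raise ValueError(f"missing metadata keys: {sorted(missing)}")
    return meta["genome_assembly"], meta["coordinate_system"]

example = """\
genome_assembly: GRCh37
coordinate_system: 1-based
data_file_name: gwas_sumstat_name.tsv
"""
print(check_metadata(read_flat_yaml(example)))  # ('GRCh37', '1-based')
```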

Step 2: Run the pipeline.

@@ -64,7 +85,7 @@ nextflow run EBISPOT/gwas-sumstats-harmoniser \
--ref 'full path to store reference' \
--harm \
--file Full_path_of_the_file_to_be_harmonised or --list path_of_list.txt \
-profile cluster,singularity/conda or -profile standard,docker/conda
-profile executor,singularity/conda or -profile standard,docker/conda
```
To harmonise a batch of files, provide `--list list.txt`: a plain-text file in which each row is the full path of a TSV file to be harmonised.
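One hypothetical way to build such a list file (the helper name and directory layout are assumptions for illustration, not part of the pipeline):

```python
# Sketch: write one absolute .tsv path per line, the layout --list expects.
from pathlib import Path

def write_batch_list(sumstats_dir, out_file):
    """Collect all .tsv files under sumstats_dir and write their absolute
    paths, one per line, to out_file. Returns the list of paths."""
    paths = sorted(str(p.resolve()) for p in Path(sumstats_dir).glob("*.tsv"))
    Path(out_file).write_text("\n".join(paths) + "\n")
    return paths
```

The resulting file is then passed to the pipeline via `--list`.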

@@ -77,7 +98,7 @@ We constructed a customised pipeline for the GWAS Catalog daily run. This pipeli
nextflow run EBISPOT/gwas-sumstats-harmoniser \
--ref 'full path to store reference' \
--gwascatalog \
-profile cluster,singularity/
-profile executor,singularity/
```

### 3.3. Other options:
@@ -106,4 +127,13 @@ Conda environments are stored on the file system. By default Nextflow instructs
[Nextflow’s documentation](https://www.nextflow.io/docs/latest/executor.html).

# 4. Harmonisation steps:
More information about the harmonisation process refers to [GWAS catalog documents](https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics)
For more information about the harmonisation process, see the [GWAS catalog documents](https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics) and [our documentation](https://ebispot.github.io/gwas-sumstats-harmoniser-documentation/)

# 5. Contact us:
🫶 We'd love to hear from you!

* To provide feedback or ask a question, contact the GWAS Catalog team at [email protected].
* If you believe you’ve encountered a bug, please don’t hesitate to report it in our [GitHub repository](https://github.com/EBISPOT/gwas-sumstats-harmoniser/issues/new?assignees=&labels=bug&projects=&template=bug_report.yml&title=%5BBug%5D%3A+).
* If you encounter any issues while running the pipeline, feel free to join the discussion in the [gwas-sumstats-harmoniser Discussions](https://github.com/EBISPOT/gwas-sumstats-harmoniser/discussions)!


62 changes: 0 additions & 62 deletions bin/basic_qc_nf.py
@@ -30,31 +30,6 @@
# - if chr and bp not ints: remove row
# 5) set chr 'x' and 'y' to 23 and 24



class sqlClient():
def __init__(self, database):
self.database = database
self.conn = self.create_conn()
self.cur = self.conn.cursor()

def create_conn(self):
try:
conn = sqlite3.connect(self.database)
conn.row_factory = sqlite3.Row
return conn
except NameError as e:
print(e)
return None

def get_synonyms(self, rsid):
data = []
for row in self.cur.execute("select name from variation_synonym where variation_id in (select variation_id from variation_synonym where name =?)", (rsid,)):
data.append(row[0])
return data



hm_header_transformations = {

# variant id
@@ -150,40 +125,6 @@ def drop_last_element_from_filename(filename):
filename_parts = filename.split('-')
return '-'.join(filename_parts[:-1])


"""
def resolve_invalid_rsids(row, header, ensembl_client=None, sql_client=None):
hm_rsid_idx = header.index('hm_rsid')
snp_idx = header.index(RSID)
# if possible, set variant_id to harmonised rsid
if row[hm_rsid_idx].startswith('rs'):
# check that if rsID already present is not synonym of that found in vcf
if row[snp_idx].startswith('rs') and row[snp_idx] != row[hm_rsid_idx]:
synonyms = []
if ensembl_client:
rs_info = ensembl_client.get_rsid(row[snp_idx])
if rs_info != "NA":
try:
synonyms = rs_info["synonyms"]
synonyms.append(rs_info["name"])
except TypeError:
row[snp_idx] = 'NA'
elif sql_client:
synonyms = sql_client.get_synonyms(row[snp_idx])
print(synonyms)
if row[hm_rsid_idx] in synonyms:
row[snp_idx] = row[hm_rsid_idx]
else:
row[snp_idx] = 'NA'
else:
row[snp_idx] = row[hm_rsid_idx]
# if variant_id is doesn't begin 'rs'
if not row[snp_idx].startswith('rs'):
row[snp_idx] = 'NA'
return row
"""


def get_csv_reader(csv_file):
dialect = csv.Sniffer().sniff(csv_file.readline())
csv_file.seek(0)
@@ -237,9 +178,6 @@ def main():
else:
# First try to replace an invalid variant_id with the hm_rsid
# Checks for blanks, integers and floats:
#sql_client = sqlClient(db) if db else None
#ensembl_client = EnsemblRestClient() if not db else None
#row = resolve_invalid_rsids(row, header, ensembl_client, sql_client)
row = blanks_to_NA(row)
row = map_chr_values_to_numbers(row, header)
unharmonisable = remove_row_if_unharmonisable(row, header)
21 changes: 12 additions & 9 deletions bin/common_constants.py
@@ -9,6 +9,7 @@
RANGE_U_DSET = 'ci_upper'
RANGE_L_DSET = 'ci_lower'
BETA_DSET = 'beta'
ZSCORE_DSET = 'z_score'
RSID = 'rsid'
SE_DSET = 'standard_error'
EFFECT_DSET = 'effect_allele'
@@ -20,6 +21,7 @@
HM_RANGE_U_DSET = 'hm_ci_upper'
HM_RANGE_L_DSET = 'hm_ci_lower'
HM_BETA_DSET = 'hm_beta'
HM_ZSCORE_DSET = 'hm_zscore'
HM_EFFECT_DSET = 'hm_effect_allele'
HM_OTHER_DSET = 'hm_other_allele'
HM_FREQ_DSET = 'hm_effect_allele_frequency'
@@ -28,30 +30,30 @@


DSET_TYPES = {SNP_DSET: str, RSID: str, PVAL_DSET: float, MANTISSA_DSET: float, EXP_DSET: int, STUDY_DSET: str,
CHR_DSET: int, BP_DSET: int, OR_DSET: float, RANGE_U_DSET: float, RANGE_L_DSET: float, BETA_DSET: float, SE_DSET: float,
CHR_DSET: int, BP_DSET: int, OR_DSET: float, RANGE_U_DSET: float, RANGE_L_DSET: float, BETA_DSET: float, SE_DSET: float, ZSCORE_DSET: float,
EFFECT_DSET: str, OTHER_DSET: str, FREQ_DSET: float, HM_EFFECT_DSET: str,
HM_OTHER_DSET: str, HM_BETA_DSET: float, HM_OR_DSET: float, HM_FREQ_DSET: float, HM_CODE: int,
HM_OTHER_DSET: str, HM_BETA_DSET: float, HM_OR_DSET: float, HM_FREQ_DSET: float, HM_CODE: int, HM_ZSCORE_DSET: float,
HM_VAR_ID: str, HM_RANGE_L_DSET: float, HM_RANGE_U_DSET: float, HM_CC_DSET: str}

REFERENCE_DSET = MANTISSA_DSET
HARMONISATION_PREFIX = 'hm_'
GWAS_CATALOG_STUDY_PREFIX = 'GCST'

TO_DISPLAY_DEFAULT = {SNP_DSET, RSID, PVAL_DSET, STUDY_DSET, CHR_DSET, BP_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET,
HM_BETA_DSET, HM_EFFECT_DSET, HM_OTHER_DSET, HM_FREQ_DSET, HM_CODE}
HM_BETA_DSET, HM_ZSCORE_DSET, HM_EFFECT_DSET, HM_OTHER_DSET, HM_FREQ_DSET, HM_CODE}

TO_DISPLAY_RAW = {SNP_DSET, RSID, PVAL_DSET, STUDY_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET,
TO_DISPLAY_RAW = {SNP_DSET, RSID, PVAL_DSET, STUDY_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET, ZSCORE_DSET,
SE_DSET, EFFECT_DSET, OTHER_DSET, FREQ_DSET}


TO_LOAD_DSET_HEADERS_DEFAULT = {SNP_DSET, RSID, PVAL_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET,
SE_DSET, EFFECT_DSET, OTHER_DSET, FREQ_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET, HM_BETA_DSET,
TO_LOAD_DSET_HEADERS_DEFAULT = {SNP_DSET, RSID, PVAL_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET, ZSCORE_DSET,
SE_DSET, EFFECT_DSET, OTHER_DSET, FREQ_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET, HM_BETA_DSET, HM_ZSCORE_DSET,
HM_EFFECT_DSET, HM_OTHER_DSET, HM_FREQ_DSET, HM_CODE}
TO_STORE_DSETS_DEFAULT = {SNP_DSET, RSID, MANTISSA_DSET, EXP_DSET, STUDY_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET,
BETA_DSET, SE_DSET, EFFECT_DSET, OTHER_DSET, FREQ_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET, HM_BETA_DSET,
BETA_DSET, ZSCORE_DSET, SE_DSET, EFFECT_DSET, OTHER_DSET, FREQ_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET, HM_BETA_DSET, HM_ZSCORE_DSET,
HM_EFFECT_DSET, HM_OTHER_DSET, HM_FREQ_DSET, HM_VAR_ID, HM_CODE}
TO_QUERY_DSETS_DEFAULT = {SNP_DSET, RSID, MANTISSA_DSET, EXP_DSET, STUDY_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET,
SE_DSET, EFFECT_DSET, OTHER_DSET, FREQ_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET, HM_BETA_DSET,
TO_QUERY_DSETS_DEFAULT = {SNP_DSET, RSID, MANTISSA_DSET, EXP_DSET, STUDY_DSET, CHR_DSET, BP_DSET, OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET, ZSCORE_DSET,
SE_DSET, EFFECT_DSET, OTHER_DSET, FREQ_DSET, HM_OR_DSET, HM_RANGE_L_DSET, HM_RANGE_U_DSET, HM_BETA_DSET, HM_ZSCORE_DSET,
HM_EFFECT_DSET, HM_OTHER_DSET, HM_FREQ_DSET, HM_VAR_ID, HM_CODE}
TO_INDEX = {SNP_DSET, RSID, PVAL_DSET, CHR_DSET, BP_DSET}
REQUIRED = {CHR_DSET, PVAL_DSET, SNP_DSET}#, EFFECT_DSET, OTHER_DSET}
@@ -64,6 +66,7 @@
RSID: "--rsid_col",
BETA_DSET: "--beta_col",
OR_DSET: "--or_col",
ZSCORE_DSET: "--zscore_col",
RANGE_L_DSET: "--or_col_lower",
RANGE_U_DSET: "--or_col_upper",
FREQ_DSET: "--eaf_col",
4 changes: 2 additions & 2 deletions bin/creat_log.py
@@ -48,7 +48,7 @@
count=success.iloc[i,0]
per=success.iloc[i,1]
print(key,count,"{0:.2%}".format(per),code_table[key],sep="\t")
print("\n################################################################\n\n")
print("\n################################################################\n")

# Failed harmonized variants
print("\n6. Failed harmonisation\n")
@@ -73,7 +73,7 @@
for key, count in hm_code_fail_dict.items():
per = count/all
print(key,count,"{0:.2%}".format(per),code_table[key],sep="\t")
print("\n################################################################\n\n")
print("\n################################################################\n")


print("\n7. Overview\n")
90 changes: 0 additions & 90 deletions bin/fill_NA_for_hm_code15_AMPspecific.py

This file was deleted.

13 changes: 11 additions & 2 deletions bin/lib/SumStatRecord.py
@@ -4,8 +4,8 @@
class SumStatRecord:
""" Class to hold a summary statistic record.
"""
def __init__(self, chrom, pos, other_al, effect_al, beta, oddsr,
oddsr_lower, oddsr_upper, eaf, rsid, data,hm_coordinate_conversion):
def __init__(self, chrom, pos, other_al, effect_al, beta, zscore, oddsr,
oddsr_lower, oddsr_upper, eaf, rsid, data, hm_coordinate_conversion):

# Set raw info
self.chrom = chrom
@@ -15,6 +15,7 @@
self.data = data
self.beta = safe_float(beta) if beta is not None else None
self.oddsr = safe_float(oddsr) if oddsr is not None else None
self.zscore = safe_float(zscore) if zscore is not None else None
self.oddsr_lower = safe_float(oddsr_lower) if oddsr_lower is not None else None
self.oddsr_upper = safe_float(oddsr_upper) if oddsr_upper is not None else None
self.rsid = str(rsid) if rsid is not None else None
@@ -77,6 +78,12 @@ def flip_beta(self):
if self.beta:
if self.beta != 0:
self.beta = self.beta * -1

# Flip Z-score
if self.zscore:
if self.zscore != 0:
self.zscore = self.zscore * -1

# Flip OR
if self.oddsr:
self.oddsr = self.oddsr ** -1
@@ -93,6 +100,7 @@ def flip_beta(self):
# Flip eaf
if self.eaf:
self.eaf = 1 - self.eaf
#print(f"pos:{self.pos},beta:{self.beta}, OR:{self.oddsr},zscore:{self.zscore},eaf:{self.eaf}")

def alleles(self):
"""
@@ -108,6 +116,7 @@ def __repr__(self):
" other allele : " + str(self.other_al),
" effect allele: " + str(self.effect_al),
" beta : " + str(self.beta),
" Z-score : " + str(self.zscore),
" odds ratio : " + str(self.oddsr),
" EAF : " + str(self.eaf)
])
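The orientation-flip arithmetic that `flip_beta` applies in the diff above (sign-flip the beta and the newly added z-score, invert the odds ratio, complement the effect allele frequency) can be sketched in isolation. The standalone function below is illustrative only and is not part of `SumStatRecord`:

```python
# Sketch of the allele-flip arithmetic: when effect/other alleles are
# swapped, beta and z-score change sign, the odds ratio is inverted,
# and the effect allele frequency becomes 1 - eaf.
def flip_effects(beta=None, zscore=None, oddsr=None, eaf=None):
    if beta is not None:
        beta = -beta
    if zscore is not None:
        zscore = -zscore
    if oddsr is not None:
        oddsr = oddsr ** -1
    if eaf is not None:
        eaf = 1 - eaf
    return beta, zscore, oddsr, eaf

print(flip_effects(beta=0.5, zscore=2.0, oddsr=2.0, eaf=0.25))
# (-0.5, -2.0, 0.5, 0.75)
```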
