# 6_process_results.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Ed Mountjoy
#
'''
Processes results for the genetics portal. Processing includes:
1. Make the table symmetrical again
2. Filter to keep only left_type == gwas
3. Only keep the top colocalising result if multiple right loci were tested
4. Filter to remove colocs where only a small number of variants overlapped.
'''

'''
# Set SPARK_HOME and PYTHONPATH to use Spark 2.4.0
export PYSPARK_SUBMIT_ARGS="--driver-memory 8g pyspark-shell"
export SPARK_HOME=/Users/em21/software/spark-2.4.0-bin-hadoop2.7
export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip:$PYTHONPATH
'''
import gzip
from glob import glob

import pyspark.sql
from pyspark.sql import Window
from pyspark.sql.functions import *
from pyspark.sql.types import *


def _flip_side(colname):
    ''' Swaps a leading `left_` / `right_` prefix on a column name.
    Only the prefix is rewritten; names starting with neither prefix are
    returned unchanged.
    Args:
        colname (str): column name
    Returns:
        str: the column name with its side prefix swapped
    '''
    if colname.startswith('left_'):
        return 'right_' + colname[len('left_'):]
    if colname.startswith('right_'):
        return 'left_' + colname[len('right_'):]
    return colname


def main():
    ''' Processes the raw coloc table and writes the result as parquet.

    Steps, in execution order:
      1. Rename `PP.H0.abf` and add the H4/H3 ratio and its log2.
      2. Drop tests with fewer than `min_overlapping_vars` variants.
      3. Optionally append a left/right-swapped copy of every row
         (flagged by `is_flipped`) to make the table symmetric.
      4. Optionally keep only rows where left_type == gwas.
      5. Optionally keep only the highest-H4 row per left/right dataset pair.
      6. Add gene IDs derived from the phenotype IDs, null out "None"
         strings, drop unneeded columns and rows with null coloc stats.
      7. Repartition by left_chrom / left_pos and write.

    Input/output paths and the switches above are set inside the function.

    Returns:
        int: 0 on completion
    '''

    # Make spark session
    spark = (
        pyspark.sql.SparkSession.builder
        .config("spark.master", "local[*]")
        .getOrCreate()
    )
    # sc = spark.sparkContext
    print('Spark version: ', spark.version)

    # File args
    in_parquet = '/output/coloc_raw.parquet'
    out_parquet = '/output/coloc_processed.parquet'
    # in_parquet = '/Users/em21/Projects/genetics-colocalisation/tmp/coloc_raw.parquet'
    # out_parquet = '/Users/em21/Projects/genetics-colocalisation/tmp/coloc_processed.parquet'
    in_phenotype_maps = '/configs/phenotype_id_gene_luts/*.tsv.gz'

    # Results parameters
    make_symmetric = True # Will make the coloc matrix symmetric
    left_gwas_only = True # Output will only contain rows where left_type == gwas
    deduplicate_right = True # For each left dataset, only keep the "best" right dataset
    min_overlapping_vars = 100 # Only keep results with this many overlapping vars

    # Once the table is symmetric every test is stored as two rows (one per
    # direction), so removed-row counts are halved before being reported.
    # NOTE(review): after the left_type filter the table presumably no longer
    # holds exactly two rows per test, so later halved counts look
    # approximate - confirm the intended reporting unit.
    rows_per_test = 2 if make_symmetric else 1

    # Load
    df = spark.read.parquet(in_parquet) #.limit(100)

    # Rename and calc new columns
    df = (
        df.withColumnRenamed('PP.H0.abf', 'coloc_h0')
        .withColumn('coloc_h4_h3', (col('coloc_h4') / col('coloc_h3')))
        .withColumn('coloc_log2_h4_h3', log2(col('coloc_h4_h3')))
    )

    # Filter based on the number of snps overlapping the left and right
    # datasets. The count is only run when the filter is enabled, as each
    # count() triggers a Spark job.
    if min_overlapping_vars:
        last_n = df.count()
        df = df.filter(col('coloc_n_vars') >= min_overlapping_vars)
        print('{} coloc tests removed for having fewer than {} overlapping variants'.format(last_n - df.count(), min_overlapping_vars))

    # Make symmetric
    if make_symmetric:

        # Copy of the table with every left_* column renamed to right_* and
        # vice versa; toDF renames positionally, so column order is kept
        df_rev = df.toDF(*[_flip_side(colname) for colname in df.columns])

        # Take union by name between original and flipped dataset
        df = df.withColumn('is_flipped', lit(False))
        df_rev = df_rev.withColumn('is_flipped', lit(True))
        df = df.unionByName(df_rev)

    # Keep only rows where left_type == gwas
    if left_gwas_only:
        last_n = df.count()
        df = df.filter(col('left_type') == 'gwas')
        print('{} coloc tests removed where left_type was not gwas'.format(
            (last_n - df.count()) // rows_per_test))

    # Deduplicate right
    if deduplicate_right:

        # Rows sharing all of these values are treated as duplicates. The
        # right variant columns are deliberately left out, so multiple right
        # loci for the same right dataset collapse to a single row.
        col_subset = [
            'left_type',
            'left_study',
            'left_phenotype',
            'left_bio_feature',
            'left_chrom',
            'left_pos',
            'left_ref',
            'left_alt',
            'right_type',
            'right_study',
            'right_bio_feature',
            'right_phenotype',
            # 'right_chrom',
            # 'right_pos',
            # 'right_ref',
            # 'right_alt'
        ]

        # Drop duplicates, keeping the row with the highest coloc_h4
        last_n = df.count()
        df = drop_duplicates_keep_first(
            df,
            subset=col_subset,
            order_colname='coloc_h4',
            ascending=False
        )
        print('{} coloc tests removed that were duplicates'.format(
            (last_n - df.count()) // rows_per_test))

    # Add gene_id using phenotype_id
    # Need to handle both eQTLs, which may have phenotype_id as an array probe
    # and sQTLs, which have the gene_id within the phenotype_id field
    phenotype_map = load_pheno_to_gene_map(in_phenotype_maps)
    # Falls back to the phenotype_id itself when it is not in the map
    biofeature_mapper = udf(lambda x: phenotype_map.get(x, x))
    for side in ['left', 'right']:
        side_type = col(side + '_type')
        side_phenotype = col(side + '_phenotype')
        df = df.withColumn(
            side + '_gene_id',
            when(side_type == 'eqtl', biofeature_mapper(side_phenotype))
            # sQTL phenotype IDs are "^"-delimited; item 4 is used as the
            # gene ID. Raw string so "\^" reaches the regex untouched.
            .when(side_type == 'sqtl', split(side_phenotype, r'\^').getItem(4))
            .otherwise(lit(None))
        )

    # Set gene_id to null if it doesn't start with ENSG
    for colname in ['left_gene_id', 'right_gene_id']:
        df = df.withColumn(
            colname,
            when(col(colname).startswith('ENSG'), col(colname))
                .otherwise(lit(None))
        )

    # Set phenotype_id and bio_feature to null if they somehow have the value "None"
    for colname in ['left_phenotype', 'left_bio_feature', 'right_phenotype', 'right_bio_feature']:
        df = df.withColumn(
            colname,
            when(col(colname).eqNullSafe('None'), lit(None))
                .otherwise(col(colname))
        )

    # Remove unneeded columns
    df = df.drop('left_sumstat', 'right_sumstat')
    if left_gwas_only:
        df = df.drop('left_gene_id', 'left_bio_feature', 'left_phenotype')

    # Remove rows that have null in coloc stat columns
    last_n = df.count()
    df = df.dropna(
        subset=['coloc_h3', 'coloc_h4', 'coloc_log2_h4_h3'],
        how='any'
    )
    print('{} coloc tests removed for having NA values for H4 or H3'.format(
        (last_n - df.count()) // rows_per_test))

    # Repartition
    df = (
        df.repartitionByRange(100, 'left_chrom', 'left_pos')
        .sortWithinPartitions('left_chrom', 'left_pos')
    )

    # Write
    (
        df
        .write.parquet(
            out_parquet,
            mode='overwrite'
        )
    )

    return 0


def drop_duplicates_keep_first(df, subset, order_colname, ascending=True):
    ''' Spark counterpart of pandas drop_duplicates(keep='first').
    Rows are grouped on `subset`, ordered inside each group by
    `order_colname`, and only the leading row of each group is kept.
    Args:
        df (spark df): input dataframe
        subset (list): column names that define a duplicate group
        order_colname (str): column used to order rows within a group
        ascending (bool): order ascending if True, otherwise descending
    Returns:
        df with the temporary helper columns removed
    '''
    assert isinstance(subset, list)

    # Sort key in the requested direction
    sort_key = col(order_colname) if ascending else col(order_colname).desc()

    # A per-row id acts as the secondary sort key so that ties on the order
    # column cannot leave more than one row with rank 1
    group_window = Window.partitionBy(*subset).orderBy(sort_key, 'tiebreak')

    with_id = df.withColumn('tiebreak', monotonically_increasing_id())
    ranked = with_id.withColumn('rank', rank().over(group_window))
    leaders = ranked.filter(col('rank') == 1)

    return leaders.drop('rank', 'tiebreak')


def load_pheno_to_gene_map(infs):
    ''' Loads a dictionary, mapping phenotype_ids to ensembl gene IDs.

    Each input file is a gzipped, tab-separated table whose header line
    names (at least) the columns `phenotype_id` and `gene_id`, in any
    position. Only rows whose gene_id starts with "ENSG" are kept. Rows too
    short to hold both columns (blank lines, or a row whose trailing field
    is empty) are skipped rather than raising IndexError. Completely empty
    files are skipped.

    Files are read in sorted path order, so when a phenotype_id occurs more
    than once the last file/row read wins, independent of directory listing
    order.

    Args:
        infs (str): glob pattern matching the input files
    Returns:
        dict: phenotype_id -> gene_id
    Raises:
        ValueError: if a file's header lacks either required column
    '''
    d = {}

    for inf in sorted(glob(infs)):

        with gzip.open(inf, 'r') as in_h:

            header_line = in_h.readline()
            if not header_line:
                # Zero-length file: nothing to load
                continue
            header = header_line.decode().rstrip().split('\t')

            # Resolve the column positions once per file rather than once
            # per data line
            try:
                pheno_idx = header.index('phenotype_id')
                gene_idx = header.index('gene_id')
            except ValueError:
                raise ValueError(
                    '{} must have phenotype_id and gene_id columns, '
                    'found: {}'.format(inf, header))

            # Load each line into dict
            for line in in_h:
                parts = line.decode().rstrip().split('\t')
                try:
                    phenotype_id = parts[pheno_idx]
                    gene_id = parts[gene_idx]
                except IndexError:
                    # Blank or truncated row: no usable mapping
                    continue
                if gene_id.startswith('ENSG'):
                    d[phenotype_id] = gene_id

    return d


if __name__ == '__main__':
    # Script entry point: run the processing pipeline
    main()