From a6f1bd0b989a4586f2d34d03d96c7048f4b0172a Mon Sep 17 00:00:00 2001 From: Gibran Hemani Date: Sat, 9 Feb 2019 09:13:42 +0000 Subject: [PATCH 01/12] adding bcf support for h2 --- environment.yml | 3 ++- ldsc.py | 5 ++++- ldscore/parse.py | 53 ++++++++++++++++++++++++++++++++++++++++----- ldscore/sumstats.py | 5 ++++- 4 files changed, 57 insertions(+), 9 deletions(-) diff --git a/environment.yml b/environment.yml index 60e24574..1d994fc1 100644 --- a/environment.yml +++ b/environment.yml @@ -9,5 +9,6 @@ dependencies: - pip - pip: - scipy==0.18 - - pandas==0.20 + - pandas==0.23.4 - numpy==1.16 +- pysam diff --git a/ldsc.py b/ldsc.py index aa81340a..80ccf999 100755 --- a/ldsc.py +++ b/ldsc.py @@ -490,7 +490,8 @@ def ldscore(args, log): # Basic Flags for Working with Variance Components parser.add_argument('--h2', default=None, type=str, help='Filename for a .sumstats[.gz] file for one-phenotype LD Score regression. ' - '--h2 requires at minimum also setting the --ref-ld and --w-ld flags.') + '--h2 requires at minimum also setting the --ref-ld and --w-ld flags.' + 'Can alternatively provide a bcf of GWAS summary data in which case ideally you should also specify --snplist flag to list the rs IDs to retain for the analysis.') parser.add_argument('--h2-cts', default=None, type=str, help='Filename for a .sumstats[.gz] file for cell-type-specific analysis. ' '--h2-cts requires the --ref-ld-chr, --w-ld, and --ref-ld-chr-cts flags.') @@ -543,6 +544,8 @@ def ldscore(args, log): parser.add_argument('--ref-ld-chr-cts', default=None, type=str, help='Name of a file that has a list of file name prefixes for cell-type-specific analysis.') parser.add_argument('--print-all-cts', action='store_true', default=False) +parser.add_argument('--snplist', default=None, type=str, + help='Filename for a .snplist[.gz] file which lists the rs IDs to extract from a bcf file being analysed by --h2. One rsID per line.') # Flags for both LD Score estimation and h2/gencor estimation parser.add_argument('--print-cov', default=False, action='store_true', diff --git a/ldscore/parse.py b/ldscore/parse.py index 18fe7c98..3c704afa 100644 --- a/ldscore/parse.py +++ b/ldscore/parse.py @@ -10,6 +10,8 @@ import pandas as pd import os import glob +from pysam import VariantFile +import gzip def series_eq(x, y): @@ -51,7 +53,6 @@ def which_compression(fh): compression = None else: raise IOError('Could not open {F}[./gz/bz2]'.format(F=fh)) - return suffix, compression @@ -61,6 +62,8 @@ def get_compression(fh): compression = 'gzip' elif fh.endswith('bz2'): compression = 'bz2' + elif fh.endswith('bcf'): + compression = 'bcf' else: compression = None @@ -77,7 +80,7 @@ def read_cts(fh, match_snps): return cts.ANNOT.values -def sumstats(fh, alleles=False, dropna=True): +def sumstats(fh, alleles=False, dropna=True, slh=None): '''Parses .sumstats files. See docs/file_formats_sumstats.txt.''' dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str} compression = get_compression(fh) @@ -85,10 +88,16 @@ def sumstats(fh, alleles=False, dropna=True): if alleles: usecols += ['A1', 'A2'] - try: - x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression) - except (AttributeError, ValueError) as e: - raise ValueError('Improperly formatted sumstats file: ' + str(e.args)) + if compression == 'bcf': + try: + x = read_bcf(fh, usecols, slh) + except (AttributeError, ValueError) as e: + raise ValueError('Improperly formatted sumstats file: ' + str(e.args)) + else: + try: + x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression) + except (AttributeError, ValueError) as e: + raise ValueError('Improperly formatted sumstats file: ' + str(e.args)) if dropna: x = x.dropna(how='any') @@ -96,6 +105,38 @@ def sumstats(fh, alleles=False, dropna=True): return x + +def read_bcf(fh, usecols, slh=None): + bcf_in = VariantFile(fh) + + o = [[rec.id, rec.info["EFFECT"][0]/rec.info["SE"][0], rec.info["N"][0]] for rec in bcf_in.fetch()] + p = pd.DataFrame(np.array(o), columns=usecols) + bcf_in.close() + + if slh is not None: + compression = get_compression(slh) + sl = [] + if compression == "gz": + try: + with gzip.open(slh) as f: + for line in f: + sl.append(line.strip()) + except (AttributeError, ValueError) as e: + raise ValueError('Improperly formatted snplist file: ' + str(e.args)) + else: + try: + with open(slh) as f: + for line in f: + sl.append(line.strip()) + except (AttributeError, ValueError) as e: + raise ValueError('Improperly formatted snplist file: ' + str(e.args)) + f.close() + p = p.loc[p['SNP'].isin(sl)] + + return(p) + + + def ldscore_fromlist(flist, num=None): '''Sideways concatenation of a list of LD Score files.''' ldscore_array = [] diff --git a/ldscore/sumstats.py b/ldscore/sumstats.py index 1c57491f..38999775 100644 --- a/ldscore/sumstats.py +++ b/ldscore/sumstats.py @@ -160,7 +160,10 @@ def _read_chr_split_files(chr_arg, not_chr_arg, log, noun, parsefunc, **kwargs): def _read_sumstats(args, log, fh, alleles=False, dropna=False): '''Parse summary statistics.''' log.log('Reading summary statistics from {S} ...'.format(S=fh)) - sumstats = ps.sumstats(fh, alleles=alleles, dropna=dropna) + if args.snplist: + sumstats = ps.sumstats(fh, alleles=alleles, dropna=dropna, slh=args.snplist) + else: + sumstats = ps.sumstats(fh, alleles=alleles, dropna=dropna) log_msg = 'Read summary statistics for {N} SNPs.' log.log(log_msg.format(N=len(sumstats))) m = len(sumstats) From 6733c01708de3b48cbc7e0e682f8d7108161d958 Mon Sep 17 00:00:00 2001 From: Gibran Hemani Date: Sat, 9 Feb 2019 20:21:31 +0000 Subject: [PATCH 02/12] bcf seems to be working --- ldscore/parse.py | 2 +- ldscore/sumstats.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ldscore/parse.py b/ldscore/parse.py index 3c704afa..147e9552 100644 --- a/ldscore/parse.py +++ b/ldscore/parse.py @@ -116,7 +116,7 @@ def read_bcf(fh, usecols, slh=None): if slh is not None: compression = get_compression(slh) sl = [] - if compression == "gz": + if compression == "gzip": try: with gzip.open(slh) as f: for line in f: diff --git a/ldscore/sumstats.py b/ldscore/sumstats.py index 38999775..95191c66 100644 --- a/ldscore/sumstats.py +++ b/ldscore/sumstats.py @@ -161,6 +161,7 @@ def _read_sumstats(args, log, fh, alleles=False, dropna=False): '''Parse summary statistics.''' log.log('Reading summary statistics from {S} ...'.format(S=fh)) if args.snplist: + log.log('and extracting SNPs specified in {S} ...'.format(S=args.snplist)) sumstats = ps.sumstats(fh, alleles=alleles, dropna=dropna, slh=args.snplist) else: sumstats = ps.sumstats(fh, alleles=alleles, dropna=dropna) From 0eef364045d3944e6f2bd1a7859a43e93c59bf28 Mon Sep 17 00:00:00 2001 From: Gibran Hemani Date: Sat, 9 Feb 2019 20:57:37 +0000 Subject: [PATCH 03/12] fix --- ldscore/parse.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ldscore/parse.py b/ldscore/parse.py index 147e9552..a25ce0ec 100644 --- a/ldscore/parse.py +++ b/ldscore/parse.py @@ -90,7 +90,7 @@ def sumstats(fh, alleles=False, dropna=True, slh=None): if compression == 'bcf': try: - x = read_bcf(fh, usecols, slh) + x = read_bcf(fh, alleles, slh) except (AttributeError, ValueError) as e: raise ValueError('Improperly formatted sumstats file: ' + str(e.args)) else: @@ -106,10 +106,15 @@ def sumstats(fh, alleles=False, dropna=True, slh=None): -def read_bcf(fh, usecols, slh=None): +def read_bcf(fh, alleles, slh=None): bcf_in = VariantFile(fh) - o = [[rec.id, rec.info["EFFECT"][0]/rec.info["SE"][0], rec.info["N"][0]] for rec in bcf_in.fetch()] + if alleles: + usecols = ['SNP', 'Z', 'N', 'A1', 'A2'] + o = [[rec.id, rec.info["EFFECT"][0]/rec.info["SE"][0], rec.info["N"][0], rec.alt, rec.ref] for rec in bcf_in.fetch()] + else: + usecols = ['SNP', 'Z', 'N'] + o = [[rec.id, rec.info["EFFECT"][0]/rec.info["SE"][0], rec.info["N"][0]] for rec in bcf_in.fetch()] p = pd.DataFrame(np.array(o), columns=usecols) bcf_in.close() From c74b6b196eafa1918cd7103e38cfad03a8743388 Mon Sep 17 00:00:00 2001 From: Gibran Hemani Date: Mon, 11 Feb 2019 18:06:40 +0000 Subject: [PATCH 04/12] updated bcf parsing --- ldscore/parse.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/ldscore/parse.py b/ldscore/parse.py index a25ce0ec..f93c96b6 100644 --- a/ldscore/parse.py +++ b/ldscore/parse.py @@ -110,12 +110,26 @@ def read_bcf(fh, alleles, slh=None): bcf_in = VariantFile(fh) if alleles: + dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str} usecols = ['SNP', 'Z', 'N', 'A1', 'A2'] - o = [[rec.id, rec.info["EFFECT"][0]/rec.info["SE"][0], rec.info["N"][0], rec.alt, rec.ref] for rec in bcf_in.fetch()] + o = [[rec.id, rec.info["EFFECT"][0]/rec.info["SE"][0], rec.info["N"][0], rec.alts[0], rec.ref] for rec in bcf_in.fetch()] + p = pd.DataFrame( + {'SNP': pd.Series([x[0] for x in o], dtype='str'), + 'Z': pd.Series([x[1] for x in o], dtype='float'), + 'N': pd.Series([x[2] for x in o], dtype='float'), + 'A1': pd.Series([x[3] for x in o], dtype='str'), + 'A2': pd.Series([x[4] for x in o], dtype='str')} + ) else: + dtype_dict = {'SNP': str, 'Z': float, 'N': float} usecols = ['SNP', 'Z', 'N'] o = [[rec.id, rec.info["EFFECT"][0]/rec.info["SE"][0], rec.info["N"][0]] for rec in bcf_in.fetch()] - p = pd.DataFrame(np.array(o), columns=usecols) + p = pd.DataFrame( + {'SNP': pd.Series([x[0] for x in o], dtype='str'), + 'Z': pd.Series([x[1] for x in o], dtype='float'), + 'N': pd.Series([x[2] for x in o], dtype='float')} + ) + bcf_in.close() if slh is not None: From cb9980ff4d70813ed2b7b9424304839e10417329 Mon Sep 17 00:00:00 2001 From: YiLiu6240 Date: Thu, 4 Apr 2019 12:28:48 +0100 Subject: [PATCH 05/12] Convert EFFECT to float for BGC BCF format change --- ldscore/parse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ldscore/parse.py b/ldscore/parse.py index f93c96b6..6ce26fee 100644 --- a/ldscore/parse.py +++ b/ldscore/parse.py @@ -112,7 +112,7 @@ def read_bcf(fh, alleles, slh=None): if alleles: dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str} usecols = ['SNP', 'Z', 'N', 'A1', 'A2'] - o = [[rec.id, rec.info["EFFECT"][0]/rec.info["SE"][0], rec.info["N"][0], rec.alts[0], rec.ref] for rec in bcf_in.fetch()] + o = [[rec.id, float(rec.info["EFFECT"][0])/rec.info["SE"][0], rec.info["N"][0], rec.alts[0], rec.ref] for rec in bcf_in.fetch()] p = pd.DataFrame( {'SNP': pd.Series([x[0] for x in o], dtype='str'), 'Z': pd.Series([x[1] for x in o], dtype='float'), @@ -123,7 +123,7 @@ def read_bcf(fh, alleles, slh=None): else: dtype_dict = {'SNP': str, 'Z': float, 'N': float} usecols = ['SNP', 'Z', 'N'] - o = [[rec.id, rec.info["EFFECT"][0]/rec.info["SE"][0], rec.info["N"][0]] for rec in bcf_in.fetch()] + o = [[rec.id, float(rec.info["EFFECT"][0])/rec.info["SE"][0], rec.info["N"][0]] for rec in bcf_in.fetch()] p = pd.DataFrame( {'SNP': pd.Series([x[0] for x in o], dtype='str'), 'Z': pd.Series([x[1] for x in o], dtype='float'), From 1a4ae0a57afccc19ed45b1d42a4c24d2ac68472a Mon Sep 17 00:00:00 2001 From: Gibran Hemani Date: Fri, 16 Aug 2019 22:43:19 +0100 Subject: [PATCH 06/12] added vcf support --- ldsc.py | 2 +- ldscore/parse.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/ldsc.py b/ldsc.py index 80ccf999..3a12c4be 100755 --- a/ldsc.py +++ b/ldsc.py @@ -491,7 +491,7 @@ def ldscore(args, log): parser.add_argument('--h2', default=None, type=str, help='Filename for a .sumstats[.gz] file for one-phenotype LD Score regression. ' '--h2 requires at minimum also setting the --ref-ld and --w-ld flags.' - 'Can alternatively provide a bcf of GWAS summary data in which case ideally you should also specify --snplist flag to list the rs IDs to retain for the analysis.') + 'Can alternatively provide a vcf/vcf.gz/bcf of GWAS summary data in which case ideally you should also specify --snplist flag to list the rs IDs to retain for the analysis.') parser.add_argument('--h2-cts', default=None, type=str, help='Filename for a .sumstats[.gz] file for cell-type-specific analysis. ' '--h2-cts requires the --ref-ld-chr, --w-ld, and --ref-ld-chr-cts flags.') diff --git a/ldscore/parse.py b/ldscore/parse.py index 6ce26fee..bdc10c6f 100644 --- a/ldscore/parse.py +++ b/ldscore/parse.py @@ -58,12 +58,14 @@ def which_compression(fh): def get_compression(fh): '''Which sort of compression should we use with read_csv?''' - if fh.endswith('gz'): + if fh.endswith('vcf.gz') + compression = 'bcf' + elif fh.endswith('bcf'): + compression = 'bcf' + elif fh.endswith('gz'): compression = 'gzip' elif fh.endswith('bz2'): compression = 'bz2' - elif fh.endswith('bcf'): - compression = 'bcf' else: compression = None From 7d6cec03a2d91a2b96770dbf6dcd6788adda20bf Mon Sep 17 00:00:00 2001 From: Gibran Hemani Date: Mon, 19 Aug 2019 15:07:29 +0100 Subject: [PATCH 07/12] updated to work with revised vcf format --- ldscore/parse.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ldscore/parse.py b/ldscore/parse.py index bdc10c6f..5678eb17 100644 --- a/ldscore/parse.py +++ b/ldscore/parse.py @@ -58,7 +58,7 @@ def which_compression(fh): def get_compression(fh): '''Which sort of compression should we use with read_csv?''' - if fh.endswith('vcf.gz') + if fh.endswith('vcf.gz'): compression = 'bcf' elif fh.endswith('bcf'): compression = 'bcf' @@ -92,7 +92,7 @@ def sumstats(fh, alleles=False, dropna=True, slh=None): if compression == 'bcf': try: - x = read_bcf(fh, alleles, slh) + x = read_vcf(fh, alleles, slh) except (AttributeError, ValueError) as e: raise ValueError('Improperly formatted sumstats file: ' + str(e.args)) else: @@ -108,13 +108,13 @@ def sumstats(fh, alleles=False, dropna=True, slh=None): -def read_bcf(fh, alleles, slh=None): - bcf_in = VariantFile(fh) - +def read_vcf(fh, alleles, slh=None): + vcf_in = VariantFile(fh) + sample = list(vcf_in.header.samples)[0] if alleles: dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str} usecols = ['SNP', 'Z', 'N', 'A1', 'A2'] - o = [[rec.id, float(rec.info["EFFECT"][0])/rec.info["SE"][0], rec.info["N"][0], rec.alts[0], rec.ref] for rec in bcf_in.fetch()] + o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.samples[sample]['SS'][0], rec.alts[0], rec.ref] for rec in vcf_in.fetch()] p = pd.DataFrame( {'SNP': pd.Series([x[0] for x in o], dtype='str'), 'Z': pd.Series([x[1] for x in o], dtype='float'), @@ -125,14 +125,14 @@ def read_bcf(fh, alleles, slh=None): else: dtype_dict = {'SNP': str, 'Z': float, 'N': float} usecols = ['SNP', 'Z', 'N'] - o = [[rec.id, float(rec.info["EFFECT"][0])/rec.info["SE"][0], rec.info["N"][0]] for rec in bcf_in.fetch()] + o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.samples[sample]['SS'][0]] for rec in vcf_in.fetch()] p = pd.DataFrame( {'SNP': pd.Series([x[0] for x in o], dtype='str'), 'Z': pd.Series([x[1] for x in o], dtype='float'), 'N': pd.Series([x[2] for x in o], dtype='float')} ) - bcf_in.close() + vcf_in.close() if slh is not None: compression = get_compression(slh) From aa776fa9ef886bd0aa1d626c1a11cbc466b5fdb7 Mon Sep 17 00:00:00 2001 From: Gibran Hemani Date: Mon, 30 Sep 2019 22:01:08 +0100 Subject: [PATCH 08/12] allowing vcf spec where ss is in header --- ldscore/parse.py | 47 +++++++++++++++++++++++++++++++++++++--------- test/test_parse.py | 16 ++++++++++++++++ 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/ldscore/parse.py b/ldscore/parse.py index 5678eb17..b8cdf6d2 100644 --- a/ldscore/parse.py +++ b/ldscore/parse.py @@ -94,7 +94,7 @@ def sumstats(fh, alleles=False, dropna=True, slh=None): try: x = read_vcf(fh, alleles, slh) except (AttributeError, ValueError) as e: - raise ValueError('Improperly formatted sumstats file: ' + str(e.args)) + raise ValueError('Improperly formatted bcf/vcf file: ' + str(e.args)) else: try: x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression) @@ -111,25 +111,54 @@ def sumstats(fh, alleles=False, dropna=True, slh=None): def read_vcf(fh, alleles, slh=None): vcf_in = VariantFile(fh) sample = list(vcf_in.header.samples)[0] + availcols = next(vcf_in.fetch()).format.keys() + vcf_in.seek(0) + + # Check if sample size info is in header + global_fields = [x for x in vcf_in.header.records if x.key == "SAMPLE"][0] if alleles: dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str} - usecols = ['SNP', 'Z', 'N', 'A1', 'A2'] - o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.samples[sample]['SS'][0], rec.alts[0], rec.ref] for rec in vcf_in.fetch()] + usecols = list(dtype_dict.keys()) + + # Read in data + if 'SS' in availcols: + o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.samples[sample]['SS'][0], rec.alts[0], rec.ref] for rec in vcf_in.fetch()] + N = pd.Series([x[2] for x in o], dtype='float') + else: + o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.alts[0], rec.ref] for rec in vcf_in.fetch()] + if 'TotalControls' in global_fields.keys() and 'TotalCases' in global_fields.keys(): + N = pd.Series([float(global_fields['TotalControls']) + float(global_fields['TotalCases'])] * len(o), dtype='float') + elif 'TotalControls' in global_fields.keys(): + N = pd.Series([float(global_fields['TotalControls'])] * len(o), dtype='float') + else: + N = pd.Series([np.NaN] * len(o), dtype='float') + p = pd.DataFrame( {'SNP': pd.Series([x[0] for x in o], dtype='str'), 'Z': pd.Series([x[1] for x in o], dtype='float'), - 'N': pd.Series([x[2] for x in o], dtype='float'), - 'A1': pd.Series([x[3] for x in o], dtype='str'), - 'A2': pd.Series([x[4] for x in o], dtype='str')} + 'N': N, + 'A1': pd.Series([x[2 + int('SS' in availcols)] for x in o], dtype='str'), + 'A2': pd.Series([x[3 + int('SS' in availcols)] for x in o], dtype='str')} ) else: dtype_dict = {'SNP': str, 'Z': float, 'N': float} - usecols = ['SNP', 'Z', 'N'] - o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.samples[sample]['SS'][0]] for rec in vcf_in.fetch()] + usecols = list(dtype_dict.keys()) + if 'SS' in availcols: + o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.samples[sample]['SS'][0]] for rec in vcf_in.fetch()] + N = pd.Series([x[2] for x in o], dtype='float') + else: + o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0]] for rec in vcf_in.fetch()] + if 'TotalControls' in global_fields.keys() and 'TotalCases' in global_fields.keys(): + N = pd.Series([float(global_fields['TotalControls']) + float(global_fields['TotalCases'])] * len(o), dtype='float') + elif 'TotalControls' in global_fields.keys(): + N = pd.Series([float(global_fields['TotalControls'])] * len(o), dtype='float') + else: + N = pd.Series([np.NaN] * len(o), dtype='float') + p = pd.DataFrame( {'SNP': pd.Series([x[0] for x in o], dtype='str'), 'Z': pd.Series([x[1] for x in o], dtype='float'), - 'N': pd.Series([x[2] for x in o], dtype='float')} + 'N': N} ) vcf_in.close() diff --git a/test/test_parse.py b/test/test_parse.py index 85e926e9..4dbd9265 100644 --- a/test/test_parse.py +++ b/test/test_parse.py @@ -21,6 +21,9 @@ def test_series_eq(): def test_get_compression(): + assert_equal(ps.get_compression('vcf'), 'bcf') + assert_equal(ps.get_compression('bcf'), 'bcf') + assert_equal(ps.get_compression('vcf.gz'), 'bcf') assert_equal(ps.get_compression('gz'), 'gzip') assert_equal(ps.get_compression('bz2'), 'bz2') assert_equal(ps.get_compression('asdf'), None) @@ -42,6 +45,19 @@ def test_read_sumstats(): assert_raises(ValueError, ps.sumstats, os.path.join( DIR, 'parse_test/test.l2.ldscore.gz')) +def test_read_vcf(): + x1 = ps.read_vcf("test/vcf_test/example1.vcf.gz", alleles=True) + x2 = ps.read_vcf("test/vcf_test/example1.vcf.gz", alleles=False) + x3 = ps.read_vcf("test/vcf_test/example2.vcf.gz", alleles=True) + x4 = ps.read_vcf("test/vcf_test/example2.vcf.gz", alleles=False) + x5 = ps.read_vcf("test/vcf_test/example3.vcf.gz", alleles=True) + x6 = ps.read_vcf("test/vcf_test/example3.vcf.gz", alleles=False) + assert_equal(len(x1), len(x2)) + assert_equal(len(x3), len(x4)) + assert_equal(len(x5), len(x6)) + assert('rs' in x1.SNP[0]) + assert_raises(ValueError, ps.read_vcf, os.path.join( + DIR, 'test/vcf_test/example1.vcf.gz')) def test_frq_parser(): x = ps.frq_parser(os.path.join(DIR, 'parse_test/test1.frq'), compression=None) From 8425c4b981c01b4d3b59cfbf276ffab8d04b8343 Mon Sep 17 00:00:00 2001 From: Gibran Hemani Date: Mon, 30 Sep 2019 22:02:55 +0100 Subject: [PATCH 09/12] added vcf test files --- test/vcf_test/example1.vcf.gz | Bin 0 -> 4852 bytes test/vcf_test/example2.vcf.gz | Bin 0 -> 4777 bytes test/vcf_test/example3.vcf.gz | Bin 0 -> 4652 bytes 3 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 test/vcf_test/example1.vcf.gz create mode 100644 test/vcf_test/example2.vcf.gz create mode 100644 test/vcf_test/example3.vcf.gz diff --git a/test/vcf_test/example1.vcf.gz b/test/vcf_test/example1.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..c370e1a591256502136358f504cc68824b6416ce GIT binary patch literal 4852 zcmVP)!;^OMf_peVMmz(p8H#a}OEYGhV*Na^vWBIt8U2T?sdtE-?{TwA+ z{qW(ti<>Cp^7^=p%TJ${ce~kk^~-X$-0oITVL96@9~ZmTm*s4?p6xy_XA64H=k%*Z zBt4|MzC2EKy<0rrFE;nH*>2Z&s|9_0w%xtnHy*g2;DPJK(+h-0BfKIG$mqor56kD} zZgn@P(>VAq$Ho8UbbGhnEN3s9^_SH>#IHWhR=e3xi)~@MUv8qaq0!~l)#clp85}jc zzC?g7AMdw~Qtu|%?(Xh&vw)=%Iefi+eR^7KexBVgc8l5S>E(47?RF^q`?tr1rw@?O zyTx|7{r_6+-8Y$nCyMLcX7&7V7_4yT?8{=aT0HM&Fx>NQ1#@hou-KcmW%r-4Bxl1cb^x#+1>i} z@qYHa-pxKzf-mm(nf|!gJguKsu;g!1=W~%#*nd@_k?}Vfu2pV)h<)zspH{oO&&&H% zHh-JDw!01G$aWCfiHf)D=iO%g_;=CV4cPy?&K&SjD*Rn!F^8rx#jGIF6EcYYK=Qx+ zw~Ol}YxbXHl-2X=`gJ=)jiyH)%=BN8=lbHiAKza_ra%45Kc9XK5MO85xp$g7W8NI( z#W{P^#&7S#Qh==m|J@=g}BfQJMBvYLW^`_Y2ywZU*AkVEHq6YlUXBLL0{LTW9QRmYUzyh1Q$ zG(TooE=Gi;xbylLV)QnLeDAC~h8W$=tW;jBW8UMVznRdIOMlF`yg1x5sg>2o+{cT< z9p_#Pe)O4&!>wc13UyTN;&4QiaNZqzBvJfrB-Ga8*dvMJZsjKl15s-!sMpv-44LVUC4H0tqvMP!HHK?+qwl zp*#x1B$VVMN>~(=Nhr>0lyTGalGPI;gV7Tvp>%~(s4UYAU+Ec5DCy9~rlD$oIn5?ngZn)$&nI%Z3gzv@I8>@e zc`+?17$KwK<#lEJuMDdWqHNnifJ}vHOkA0dYOIHL@)EY z9P){JsVe4`lxm`0wGE{f6ZNXCA&sBtue!`@cXWaqP_gJ3YYvTC8YuyazN2luHA=w_xO{?4(^lO+B6AL4+O@j7!bdVaNq&5l7=F{BG>nO2s z<t8>v)*lzoe?9wh_vx1TTkb!eeSG+3 z^>T|}#E;HD67p7|AdB0#4j#c@<`2KjPES|&v*A{Sf5hYV@ANR6#ZRQ$%j@n}9%1)C zyYvWm>(A@W?lv9rgsOg%?}+J$g+FYTpMJ0B`}_(A+&=KX-hRQz@M*Dy>Zhmwy5 zUJUr@mRvt4q5m*LX`XXL1HEj!1!9TKo%>VfPr1BdHb@_kVc0P-w0!mT+dqEz;k%O` zf4Dw*_sz+N%d3-%_ctg1`lpNcC+WT13B77NNv{P?q6bg;SNud_@OCGgEoNfxt(PY^ zCvQ*YC-egE{wg80Pba4LK^UmYGcgKCm&ZHIgr6XehG5<_r^2Iixj1 zNzjdUcr>leA)lBtOcjYj%`r*YOeg5tKn>%p=mriWO~}Yl zM3dhzxx-y}0OObyvSGt2; zB}X>+)D5=u7U6lQoaWl7Ho@f_Y$@ZwEHRqoj3Wg&4;dLo*~G`O4N(qYxN{CoxUDEm zO2+!3k{BuJjKu$PMX}P0NF9-(5*`_~K!KvC;D~Hv+f0xK6`@1uq^q;sWdFs9DrQ1N zhEaCnlg1W4+lC8JR<%V8e2Bx$hjg;K!v|y-Wk)`%+^C=R(C^}qPsyG^oj@JS`iYJ? z2{*F&#HL-;PATk2$oe)sv0t?Q!|yTQ(0d;n47w5~q{zqT1{HvEFX2KZn)vXDE+!vG z{&Q>=P1VF>`X(DrQKJ!dE?d(YE)HvS1cpj<fK12;m`Z0=L-|0bIbm-98cNQP?bQ)Hszc-@Ii-jNJ|@;4!cXbJ%1PFcir|h5#MPibL9Bc4^y! zruSEEWWo)G4_yynlk=d=nF$yw*^v-Fr@p71u!x~f+My!0d>FuVo*~3sqSULyVJrc| zC_2*7doj?#&`3rvb?ZEVA-VM)h8Bq@&Ebhjvdo$=9w)+t43+T6sANh$tVk%{+I2Lf zGTUm1rZ>zEq@gmoF<^?CBFS^M`4n(NkELZaDLtt5E*3z zWHkCY%7=suqwK_IA|I@PC&ozHh71;ODI1eoG*_mRRt{n5WKKYZNnNHQO2|+NCq{*J zfim83YtgMblLXVfUS#Howi1}s2f?HGo)053jItx2aB+m^Xc$&fQnZ9Z*fwa)yi5{_ z&q-!@G^8=ng$Woc*^v-!E1DD&0D3;_%q}#Rw}ci>L_ZoCh4+TXChMuRWMHUd6QLH$ z#_1C~8T3Z=YNiQ{r=qRel!r`}c8lVXvjQ=Skt_p4B|8ynL*pmX1SV={Ym9upGwmWs zg~>U=9J**8<W*nq39hF3=>9h-@;K8ht zwo|u&3`KNeGa@z}48KscO$73IuOwsbWu}CR(cMr<2gW2hv3WB_N_!T=6GBo?H%PE! zP0rmMUEAA`aV1XTZikUju{5C=1J6=i9;imIC)@2#&oLgvG_bS+8D35gj46zqj)#m5 zmH5P|WUOJdv18a3Ehh$I)oO^D0dt3|h6WmzT|B|zVbU)N87kq4(OO+BsykiIclKp8 zIfQ&RC>nS<=TyPs?P$?dGqsD`c?uub?0Cr5b|XQS+tT5%C-y3sDlwVrm!T}n)Sb?a zH0sz8OcLEcb8)XaT!buIB@1H09JOkTr887AI&8{0%L|(pIVingE>2$wS#(Ra(%M)n zY5VfRw;6yfqP-SDHAL80D$%byW{mfnL2%;o@!}mfJtKNE3F~S`-}4)}RBtF@QL#aC zBy(I9r|I^RNu|7V@g_1U9GM>*iuQx z3}e!(3XHHc7xX%+Z~aEYi8T2R#^AQzECghzgcGA-v~I`Z=@4hVLMc1eq$Z$Q8I>(l zoxCPwD53|p^qvX-7S3U~Y;Q%D^=CZ@DQ&=wmASfZ5HeK4BO`s~q9k3rJmM))a16nGy>cO*5*wV!|Hi8{JLSjGL<7TT}rEqe!B$ zjHkC0*aWde_|6c^e7o=DV-VIVxjF}w z<1=M_>8w%?i{0QfAi^j!@(9{1h#Fgv^i7)4v$aM{T6r;QrFu&7q*Rm~$*|a_UBF@} zqt@EQJ(rR-TG=^6nKy&aI9gAptB#cf43+FiXs}ofFJBa;++jotgNoknRebKS$*AGY zE|w*ocjCCD3K23ijKT*(9SJqPaY91$GP`p=(a1IQ>U@sX&gCe{-vNlYP)Uw-7S9sB z=a`J@9rQQLSm&)Lox8Le*8w~9R*I4Y2}Ne25yM*-tYOZ2)@p?bwDUUG!a>^Q`ly{K zS3FMtuc&kzC1j|CM@DB=+~BZn(a_L*+dy=zDABiu_r>wDHneaaUkApQfnu;>k{XG; zj22W3wVQXo)__^5-P~Y6$EqWq$OcTiX~?)x1QVBI*6Cb9xMoPsJD!-X?b4Fj=&XGu8m+&DYLnS;hy6Cdh#5>iW zBWT-8FQ->GESnPRl`Ze!4LwQ7z%YuAbi-YlJ1tPrRcDywa78<9nYl%e(J>|s@?1#Y ziOH6;638I5ufG2J^4<{M;4>O>)i}hyqJl|{< zcYpX7VBURy`2!=)FW-Fs^7wJFK0AMN{mb*>?DAnX-&Qgf4~yC5dhxfH#nbICEaCEp z58s_%vy6+Y!!j;DeOlaZXPf1(i`incT|$M$Y`u7xZ3crqU(K0x zpX%!3FxAy|{&Y8A-^~{5^=ds^eVQ#AEpA__T{efz|{l~T+JV!A)Jlyf;b?f51zPRJT11% z+fJRz!GAq0{;$WI+tqq8dtR@;EbkzG`DwP?&VHV62DZD!nw<@eE-o)G-d@k(sM*y8 z0(9|kx2cqRH^Fwdw=e5CES1UO>&?sK<9z+g>~6lD&z6tRFI%?TzVPqg9u}TIKtgZl zo5kk;YqfXZ^b|5vTy59Or~Cb2g*#_o=IiDBX*+}Ap0-PvW6i?8ySP4QQP->O{NXS8 zLz_LR@1C}c`^9=!ECXgwg907Fh9YYD1f%TA??r#V+J0Ef@1`((U(4Qpo^NNjtCxqn z+0$w}`$!2szuRT{<9z+NdRoGgzeSzTgOq{&RfR^z-(=0oybO1yj?wQ*QJV;yc>EL~dm%A=e#e7nI(ie|47#6LeT=QCxZvcf6( zhFbgM$A@3ezWwm_v-W!cH3GDZQG0y=pnHIfK`5Za0e~3+Qd*f)Ac!cfa)%h) z0PqqN!ubOLKLBi$8s3Hj^p(`$Z>OQQRfoQk8r*GyRPnIy)ZlF^wJ_El3XlI7 zv%)Gb?7_VI&rxU8jl%@rT@MV-x_IEz-@X?@DB({=#g#i08X2VA2tqv&hrHLJVu6Y% z43kh=jwt0(OeUeEC{Zp<(@PiM5E-n@G?Xn+29;%+;S0UAlLs>(ql_<6=3t%dP~|st zsAN-6GL~Fk`a=oP>7`4Qi4*lIB1#9ev1zE%U%`q=){t?BE6Rynp+H4HF%FF>QBh4x zO1;As^~6+GMJfv^CgztZQTAXX?NB>h&Q6O%QCQwcxMG?OMTv@fqF!znHPOqmEQfNU zUZ#k7qqUi+S7}3S)kM8YYiJiI`l~GSHXNMbI#j8bjT7|}MMMRu4-NxeRu(ZI(NZQ> zn-Zm5n3O3Pl#5OzoS7I>sxTsk5IwQdt5Cu)zDzQRl7$(i_FhixvZZdAbmKHvi?TUm z#+o=j$Wk{a<-w`C`;uY@7N%8h4EiO^sELJ9mL@^_J2*&nQBstnr_bxB~{qKEJ@gH}~?Vn=cp%KFv4K@A&wCHmj%Kivd5~kZVs!=s(R+dQT+0 zP48znKrFDC%Xlo}vDDW>B_R@~41~W5>Z`Bc{_(>P-yQwYP<)^kIdObVJm;NL6)G>MLD|uuvTZg0d24ioGKI)_Eqqj#VNAxUrbbfgzP8_-i zetmJ3t}fF1AJh3|0>`m95z=aHW1^ps%|75K=e-Xy^{6z;1(h|b8tIR})H~Ps6M??%tgHX@}ClqUhbQMT-Td5?4F|ROTdDTuc6MDHJorJd@ z>wfBSsSpji2s4mY&eJv~j=tU3%6hH+J{fC;jjGcMsqfj68irH!y9`T%Trep**J)4% z^aA5tWkBI!bK9V@B5G+#K9+9lY?3Ed1ZPr@NrO=G69#d(2h=XW?7HH?v@cv}hJ+76 zvwNh5h42F!Ilg}CL1~bh1Hwg5uEGvxBfNGF^Oa0=W)-fv`ogI+O*?`L3Is z!?$lTNi269bC{b@!^mtATT<#lX%x#;PP@?AFM5RGzD^c{Wu+6xc?1_n95x2&G%JI4 z%JySa8pRTo#2!Ooj#*7-gc3VM?Z3v8~n5ermIGWDMxIb!Cpf%P#6T8m7sz{51I}lsRyM(E|Dn|Oin6^q((r+s{Em%3DG2@D$@mfbl3_}lu(m8;}Iwoa+wTE zpkV8vgsI@bwI6{Fk=qW6s9onNe0O4v*ycr!KtiQaE^{F|L{^!(p=FWLg0b|h2$ibTwe3a^vjrYhD0LMZuuNT>XNpcPb490B z4hWhCz}qNE?ed4PQn3#{@RnVXL8%biad~o;iB<+1E4700R#qb!%h%n^BJmFbUY?-; zaoA-{8iW!LqOU6z0i_<(MIGcQFseEH3ABiim2xUOX^aa-r9v>XNiV5c*?>&Yjf_H` z8`qvR(XVnEp+P!rc|NjHtJI^?D3`hDAqrgDMa~dgdrLWd@1|yS4CvyJ{J%4nWmqaC zGo8^nw+!5h1{WH-p}98cupPS4tHx1pwRa+`Mxa#4Wik{?BJ&Y;v7Weg6Pa#hZ7zi` z?DHf8&npTKjO3itj7g)^%;cC!27D3NLe%j=Q$(`}n%FM~G#KTP9W$-%=w;XRd&xv+ zFt<}WJLx^5s?qc@$jWxpSMZe1g`#U%6!>}QU{o6AGM5TGN=p=d4BDn1a{08{wqj5r zs}Y4qUgaQ^IU5+23c<{V-pPj45xIi#w;nR6%2xFqY#2U0w`8e<_ms)h^9{!$V^SfO zxU4dsXF@DQ&==mR`aedKx|ejCsea19z(Hlh>X3I72Bku7CbN#FLSzDVlX}z*_^qqD zKet0GbFmlnCWN9T^YWN_P#UCWB9u%b!VrvtUD*#7iEW;cg%S@E4TBRGU!Fk;lLnbg zqm8gkBWXOSdSvj_I9in#(dP5iiEs-$BW?zzL24$#7(%TJ`xiyxzTwbO;rM2~aKUEx zgNgHMm}D|24N{rNpkVeRraR)Dju04Xsx;-L*2BMIOl$4{J>VSY5>?=u&VO95T0(8%KHfL=oH*=vb z4NCc{sFNXD2MhY+2xWVnRAoFrsFVwl0S~EUsh?oAsC5#C8cahc>cBqNbBKwgiosOP zXrVs)ta*{AlVp2d*7_WDlFCOpC#p)WO_hx_WN9=<5Y%HY;>rCTzW7p|aGc z1g=IdTGdq}t2I}>?v%ioG|0>}7##UFXGp&~%-Ox|ghH+BZWbZ(G*eV^0;N%Groxyw z#6*2sLhy)JRDxjB7jmJ@PJt!R5EZ;wp?evTX9|b<}y z#hlNCN`>IeW;B;1T8}Y^s+vt$xTLv`=$P$gMIPGG6GL_$K~f<#Q+aO(3))79!gSmu zLS7k}El7W$CFwe+`I?Y1g8_5tvw^qZ94DdLuegl0JjH)g*q0>F6{BcT1RA0*mn27N z)N?mlE?gj5v!8S_kLE;V*oMPyl|h3>*>acwM~&*z7rbmM24M#`5jky#l*SU+=7odwS3nW{{eRB42`e!*Z%IJSe)BG!3SGZ!qiqK2eZq zkjg};PJ?G`ef0yz)Wbv8$PS*dLGKY6F_dtdlNE$+AgWhhU=X71Sc|4NT`+s&5P=4w zTu`x0xqP#?G#UD4|I_Vp96OB)-RzKcDOPzxr9v>VIZUrA*VSM`XcRm8Xfsvmd7vwH z*qA6)9sPMu>OpCc+7ac)75ujf0TW++baA463&!Xr`&nG&b4Yoz5;`AEJHgCGe?~Cm zQN66u$+NzOWVNujbZTkpcROl1D~F5{nFwu?h>%(I_Z6<;EfZcl&m#X#cX;uvR&fg^ zR4N2#HqVzAbQz%G8+6u)8X{T{d85WS;7z3xcK+BtkV&gz>>oW8>oOdzGAfO7nG3Gd{RLBmii!;Eo2ygMBF}nA zT8z!Ah`tO;gVao9Oz6(QbX=-;Cqa$6sT7usToiNh15)iRPHVJ$B+fqf#N5*{DJfQCL6y2KvLI_H=}r*0)B-KMfOe1`SbYGg0JcBXln3 zH0H3z2CwUrMBd;ro_EfYTJJlHZxX#&rh*YH>x5BM$11bv>(%kjI$6~pC7@H-*NMZu zlgdPh**u&3;5_xnD%_=PPfA+n24bj%a5_1k_iKWrLM9Vo;&t6ygrN-Ek3Qu9Zn`Ou zWpb;cF_u~tX+$bzP#VQ{RF?*EHXLptD1ugCJJnPy8Sq+y7gV7R2wqg_!h>XRLIR_f!&l_a;u~ z`lCb!rBN(Xb+=gLq6+i^-g;riw{CJ_qfp)3p@^JP^aLTgmn0eu3&EK!a;ri=G02Q6 z(&6AB+JB5fX{r0Dh^y<~s+jlpYpU(d6?ayZ)&#$jv|2$jYuIAr1^X1~~59g1M6G&}ltM%mD zbiJP4{o&ss^Y+8lFD!9>_4>o})0f%m?ELl3Z{KETSC7l-rlVu_IGbFpX8(AeEpC5f z4OhQ>dVhYyIxeq=bzI)x&u%x9_57c+$!xuu!-UynHG7fKzt$Z&iO-Za&SXcViGfbaijOPB)X=<@4j+ zWU<^#zEFWr?{<;?G+jL{7jt;>_n7mw)>5PQ8q^^EF2{9^YhGd(`})`U=JxCCPH6M@ zg=@W8QH`vN$^%((v0Q9c%g28S=C;Vb%rkpA4vb3znx!; zuE~FqQ|623<@0(1j;2>GZu)P=xjujY@!h2q>E&OJr^n{?PKBKrZ-e&n^;2g>Ef%6 zdjh5rusUZO?SO#Y3h11afs`5$aE*Z0-kBH%1bi)!RMy^vA%sv1L=6ZP;($P`1$@>v zg)n4vss&uI!NKkUf!qjaqb+K=ydH9_HjRW$UZ?DbB${Yz z=+7ypA&Dm3Dq5Gl8;YK8;;oSB+U6nW>N?wqJpOv-8Kl}sJ)U}3g5L$74&aJ!^U9*&)ZP4yot&W*4=+LHiI{Y z0p8!#2*HOu@bsH^N-3k_Nn^MPL#ZJ`CNx6e12q)AB9&WG8N@J3%IZeSM3BiSDXrR4 zA&s-kw%!mGY}`00-;#1*mT``6*@gBqSOFsCVq3}$@??=}zhMBAjgiv1jTLnqDu`m2 zZA-a4(ymrU*@QMWPO2TRGLlFD&p9y-^S%4S?q+8wNHN0zc_m9o-m zWPQ1|lpk!QMQR5t_;GotTAp_rp%~{w)t1V3q+MZGYGjmkn-29zyId>howaVHU2Pv) zZ${eH_J$34WW3rm@6+G}SESl@`8?7t)yk-(?cgv_s&8^d_~%Fusg(h|w)KY8#^-*=5_dVbaaxLao}(88g<%@jUoz~7nLHBB?}&Lf zvTk<7zzE}#OSfGTvmV*qjoT3md1Sq{!<{fFg3nNLg zT>&X#&l+djj!Q6xj3m5v05I{4tZvitB&tXc9$pl*mP@ccy2Al@_;tGeygjnxgQw58 z%crO5;_mGDm=B9jPEPS3J$Fp^EbG&9HnjcyOi!I`{%>>g@XyKd@&Bxsi;*(?=<)Gt zc0W0NT5L|&o8@ZyFgyLWntz{eW~Wc9+xhJIbTwPgrmNepr|a9*bc5fwZ+<={1*hMh zzdX)wPyc*z`Y>B8pMdkl#?!CQc>T{0`hP!vN5j3Ju3^OS@%(O5!10IW^JefJPrl#Y z!=yh>z*{E@uA-;t#b;P+bv{=4SnC^=DxWG8G970a`_rq7KYjY;{n5u?u8-dSeDvw^ z>gfF4&C#F#a{lf}E~Jj=j_F9w!;cEoPx=THWwJV2t+nwOLgoZjxMjO>&xoh$Ljp5y1uReajZ_13bDeEb{;>-PW(kB0+m)d8Cx}a84JUGCM>FF z9re-8(fKY`Fgf&qA#WKh5}3`_Yh4wIDmheo)hI4tSp&@E7(-MYyHwQ99>5~IoMWiO zKUrmvT~#EikU*FxxvDi2n1_`CU{+;kwFlU)VYKx3V6jygqVeNg^6W}HU14_ec#m2q z!&vXwCWT#C+cwNeqrTB?J=>uSDYYJZONDvI>l2DhQNnsLBO7{sfaZ{(_08_<8{hNx zkuyN0Os}sot&^khc$JgI(iE#&DdE6(E|p}%uIM^`$|~7ILE&sOl5I$Rii)SBhpLEF zDS9PQ+3F^|h}_GZ1Ioi0KVX;1Y*(~oPEa<7VwZ-ADOR;khKbr3hDi#x0_J8UjMW0e z0g3^cSHoDWV^yt_VYn1mYp1Yu>DaLbKIkn}Q`OP3z+qS7dCD^M@v2rzI7!9ix0Rz! zr%JW8xyK%#1c!d`ROY;(=PQiN4i#K$Wgrh1Zo?DIU^=@}E2JE=EtFI9bagaB(TacB zMnfsQ{>5H2z&t&#@8$Iw!6fha_3=FU2@7mhYb78$qNk#F*`xP6!0d(nWp*)~QZ|7* z=zpqY!7L}fsuc=GQkpA9TjLS*{fYs}#lEOJDHzUWkhG;_rsu1eQ8g#bm_@H(WiRxM zI!01=v21^FDl0_5h*l@29uhz+h#>E((F-ilw$X|i$0T^7lgqw8myP6DVy2-Tn2_*% zl?`Uus@6#`S=m%zv=6!yqWFSE)=9R}4~8LY$(xZa3JEJgRjXwrBr^qyC>5=+({56q zJzD7o2U|8KSn?c7Glc9^RjUMw##^D`F$Y96z|b+s1k;-iUO0`jBE*ahzY>88Gddzz z)oKOGC&fK2gUM|-y;DC`>ta%c876wJDtfxo7+DdwS|{V6Uq_Bn41qj%v6l+2FGgaG zulWDyDcGT`;hrP9w)%6d^_z|z7@vAwR@$)87Gm#fEK{(2YOuQbMpr6_ zXn~z(gro!5=ER~S=c)*R<}iTg8cPdSLZbs#rnib+EQiL}!$4)|a2zI&8uIjiBmlF5 zOT-LCZGi&K)=;KXt>EGxgCYTj;sh^h8H(=nDC;mlV%~~}x|})GeI)IKzk{X33o%KT zGt?&#A?Bc>wqQB!Ia)SG(;a)v3&V(zgX47YRxuPk538~Ysj9UWBo7bO@C0?y+v>!W z+?WEDs2VCBtr{D)n&HxIXz~FY`iWFxB3;#?+Zkp$5UDX}I;N_X3J_|GQX3O5mccF^ z^pq(x_O?<%*vSO$LsGA>FHj8?r2+(Z>SB>Q(7%<3iXJ2_9WS!;P(vIR7C?LT(Qo4sox1LP>8_VRjrO8FvtapPzVXluS?k(BE1BZCC@zL zpqX)#AOWOwAbP6x7A$y|v0_3K?#|bEslMt_Ic!so#ssH%`e6z5B8+Ofsuc?u78D}x z96V+u+hqlJa5QO+*Di4fvXX=JJX{!oSk+nuDQqVbkG_SP-Oq{K+g-gdI63SjxXjxl zI;=rAj53snO2$D}@I(hIu`Y!!%)vo^$9cSzT120B6Po0+%e==Gr2-Uma10azimjUy z_JzSD+g6;gx+PN;37CcDLBLqCfMKrlJg0)iRd)-@v_1Fv&&I}oHYLdkUbj)aiHWZ0 z6|9`S94ACf7XAR(OO~bBI5E+^x@%hoTBsV$4Ck%$rZz2i^Sw7OY+3NgEYi453oMq6 zO_@Og2!(=$H928O?gR_}-nS14AIVyzG3#@})AO`PqLivyYe6#B3lci#0dsmL@Oqtq zUCczm$?BSA0!J=5Q`LG4R#3c)N9?!B9N5L3?&T%zOyfQ%&{cV!3M=yBy45RS^yyFo zcA&DHcfGN4z6Jw^k5ze|!BTF(!WN8^0-1s#GwyVd!_C&0HJm0d7ttpu5t4FMqhza@ zk9p{vDI%C1Gcf)2Z)2|EJm>KK6d)1+=OJ#sYV-=2QS37iVa^8{?n}v_B^ek3{7vUc zbuxIYM7pZ=7OY@XJ>c1H8+5$`p7G6AVd(57%SjfmNm=E3z7r}Bn3hU6G;noiPcqIQ z;vzOmdM-9E^g^F`52VLYOHP7Slop^I`4|AhlhN^q+AcQL)*xw2tLsLXLZwvH3KZ%# z%aV@QG2y_H9u6IAGDK>2bOY5~#`Js@XdkZ{tp&=dW{^y#lYzGPtVtCvV$)p4Qe^m; zgEmbXgQ`X=L(%7cse$UQwDvGRFqVW$&Qq{Dd(D|h1fl5yx*5fjq(zL18zS*82DMHC zs(5&@Ws;TZmFD3BQc+H8d9x*mSk;OJjP9=Fpo3V@cAOZa@12s$jv;5m0rRV((2+!y zqt!G9HkuhO#lXWqmq9u((1HVcPls*m;4aaJU{w*QkbSa%7QKR%O)#||yX%1JKfB{G zf)698Pi6iSo!nu=h&;d)os5%RWWwOT3kNpBmyU{JD>)p9hi5eI@*1N%cdDXSu%fl~ zij!#g+q20Z-qV^jjeDW7Hpvz$YZ_o{a!SG(Ia|joH7Yu4)!xzyQnr1OG6hP`+Ob*k zpGlxH7V7}-Q=*r!G+!1|5@M^%n9BC~Nxjg1m7E#rvRPTTT}Y$_n-nr)a^H;;vnEl{{ZzqEXySidIIVPkZvVy1eV=Mjwt?qb1`#eLlv+dPNF# zT~*L1Fm(9Fq^mOe&Vl)*&m7qixkVbnv4Wzr_MvJN3QA6%slpmwUsqt_$TGNyUdA%YS-r)I`oJ*Tj};^#xEa%;E=F;t)0fE16Gg9JX-o&@xDvSC zbVvvHL{(qYZmx$%j;Hd)_Hf!1R3 zi_$$n5>#8w@lNqx4$;)!<1D`!>%G{liJm@`*NyjR0dJHt5;)4l#wOn(zut~mNH*}y zNuRvP$t77t%-g6G9B+hUqoF&z4ovL7EOX5{l64_*Vc~I(5xcJF6daJBl?=}ykOM~s zp)bQF8Q2PpK*F;0IJI2Q(?n6qKn`WgyG;UrsL+WJrr=&4FC#54*+!XY35oWrB$nN%8lkSk4g*mf(RjvJ(+pjB{!W*h@%q|Yk$(=WZy-ODX1k40FhJXdlm z+qDp1Q7cf%#8_WF9XK_3`EY_So-};~m`d4#Ew*lhzJeic!vF)*b@wK{e-{wHX(f|u z2y)>?0u_Ac{fV3_vR=jtMhc2AJ5uNd>BVDaF~=C$q}UwmY*nih2n;TAq@%q{-6NfY zw{^|%fR;m{!7$9je`5CbJP8$zg5oV-athDxUU0v7@}`;-=ETPZXgKn1jhwvcC_QPb zn9;y}e3Xr?AUOL&lZsq6lm*+PhM17bsA8}X^5GHd6{z6(N{!-2Ct{uCe&JGD*9Wq+ zqZu!sBG9cg)?X~|Rjrnx5-7JxyzZ*j-<3>Lu5`)jwq424w{3OGi9soNWoi`P>E|!; zLZKh-g4RSGTBil@Q*bqwG{6LR`Wh>?`^CYp9?AiWd`h8`2X9jc&$-qcfBJu+&28 Date: Tue, 26 May 2020 17:25:47 +0100 Subject: [PATCH 10/12] now using pygwasvcf --- ldscore/parse.py | 111 ++++++++++++++++++++++++----------------------- 1 file changed, 57 insertions(+), 54 deletions(-) diff --git a/ldscore/parse.py b/ldscore/parse.py index b8cdf6d2..d23f312c 100644 --- a/ldscore/parse.py +++ b/ldscore/parse.py @@ -10,7 +10,7 @@ import pandas as pd import os import glob -from pysam import VariantFile +import pygwasvcf import gzip @@ -59,9 +59,13 @@ def which_compression(fh): def get_compression(fh): '''Which sort of compression should we use with read_csv?''' if fh.endswith('vcf.gz'): - compression = 'bcf' + compression = 'vcf' + elif fh.endswith('vcf'): + compression = 'vcf' elif fh.endswith('bcf'): - compression = 'bcf' + compression = 'vcf' + elif fh.endswith('bcf.gz'): + compression = 'vcf' elif fh.endswith('gz'): compression = 'gzip' elif fh.endswith('bz2'): @@ -90,11 +94,11 @@ def sumstats(fh, alleles=False, dropna=True, slh=None): if alleles: usecols += ['A1', 'A2'] - if compression == 'bcf': + if compression == 'vcf': try: x = read_vcf(fh, alleles, slh) except (AttributeError, ValueError) as e: - raise ValueError('Improperly formatted bcf/vcf file: ' + str(e.args)) + raise ValueError('Improperly formatted VCF/BCF file: ' + str(e.args)) else: try: x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression) @@ -107,61 +111,60 @@ def sumstats(fh, alleles=False, dropna=True, slh=None): return x +""" +Function to read GWAS summary statistics from GWAS-VCF +:param fh: Path to GWAS-VCF +:param alleles: Bool value to include alleles in output +:param slh: SNP list (optional) +:param trait: Name of trait to select from GWAS-VCF. If None then assume a single GWAS is present in the file +""" +def read_vcf(fh, alleles, slh=None, trait=None): + with pygwasvcf.GwasVcf(fh) as vcf_in: + traits = vcf_in.get_traits() -def read_vcf(fh, alleles, slh=None): - vcf_in = VariantFile(fh) - sample = list(vcf_in.header.samples)[0] - availcols = next(vcf_in.fetch()).format.keys() - vcf_in.seek(0) - - # Check if sample size info is in header - global_fields = [x for x in vcf_in.header.records if x.key == "SAMPLE"][0] - if alleles: - dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str} - usecols = list(dtype_dict.keys()) - - # Read in data - if 'SS' in availcols: - o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.samples[sample]['SS'][0], rec.alts[0], rec.ref] for rec in vcf_in.fetch()] - N = pd.Series([x[2] for x in o], dtype='float') + if trait is not None: + assert trait in traits else: - o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.alts[0], rec.ref] for rec in vcf_in.fetch()] - if 'TotalControls' in global_fields.keys() and 'TotalCases' in global_fields.keys(): - N = pd.Series([float(global_fields['TotalControls']) + float(global_fields['TotalCases'])] * len(o), dtype='float') - elif 'TotalControls' in global_fields.keys(): - N = pd.Series([float(global_fields['TotalControls'])] * len(o), dtype='float') - else: - N = pd.Series([np.NaN] * len(o), dtype='float') - - p = pd.DataFrame( - {'SNP': pd.Series([x[0] for x in o], dtype='str'), - 'Z': pd.Series([x[1] for x in o], dtype='float'), - 'N': N, - 'A1': pd.Series([x[2 + int('SS' in availcols)] for x in o], dtype='str'), - 'A2': pd.Series([x[3 + int('SS' in availcols)] for x in o], dtype='str')} - ) - else: - dtype_dict = {'SNP': str, 'Z': float, 'N': float} - usecols = list(dtype_dict.keys()) - if 'SS' in availcols: - o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.samples[sample]['SS'][0]] for rec in vcf_in.fetch()] + trait = traits[0] + + # get global field info from header + metadata = vcf_in.get_metadata() + + if alleles: + dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str} + + # Read in data + o = [[ + pygwasvcf.VariantRecordGwasFuns.get_id(rec, trait, create_if_missing=True), # rsid or chr-pos-ref-alt + pygwasvcf.VariantRecordGwasFuns.get_beta(rec, trait) / pygwasvcf.VariantRecordGwasFuns.get_se(rec, trait), + pygwasvcf.VariantRecordGwasFuns.get_ss(rec, trait, metadata), # if per-snp sample size unavailable then take from header + rec.alts[0], + rec.ref + ] for rec in vcf_in.query()] N = pd.Series([x[2] for x in o], dtype='float') + + p = pd.DataFrame( + {'SNP': pd.Series([x[0] for x in o], dtype='str'), + 'Z': pd.Series([x[1] for x in o], dtype='float'), + 'N': N, + 'A1': pd.Series([x[3] for x in o], dtype='str'), + 'A2': pd.Series([x[4] for x in o], dtype='str')} + ) else: - o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0]] for rec in vcf_in.fetch()] - if 'TotalControls' in global_fields.keys() and 'TotalCases' in global_fields.keys(): - N = pd.Series([float(global_fields['TotalControls']) + float(global_fields['TotalCases'])] * len(o), dtype='float') - elif 'TotalControls' in global_fields.keys(): - N = pd.Series([float(global_fields['TotalControls'])] * len(o), dtype='float') - else: - N = pd.Series([np.NaN] * len(o), dtype='float') + dtype_dict = {'SNP': str, 'Z': float, 'N': float} - p = pd.DataFrame( - {'SNP': pd.Series([x[0] for x in o], dtype='str'), - 'Z': pd.Series([x[1] for x in o], dtype='float'), - 'N': N} - ) + o = [[ + pygwasvcf.VariantRecordGwasFuns.get_id(rec, trait, create_if_missing=True), # rsid or chr-pos-ref-alt + pygwasvcf.VariantRecordGwasFuns.get_beta(rec, trait) / pygwasvcf.VariantRecordGwasFuns.get_se(rec, trait), + pygwasvcf.VariantRecordGwasFuns.get_ss(rec, trait, metadata), # if per-snp sample size unavailable then take from header + ] for rec in vcf_in.fetch()] + N = pd.Series([x[2] for x in o], dtype='float') - vcf_in.close() + p = pd.DataFrame( + {'SNP': pd.Series([x[0] for x in o], dtype='str'), + 'Z': pd.Series([x[1] for x in o], dtype='float'), + 'N': N} + ) if slh is not None: compression = get_compression(slh) From 967dbf4d14b44ec6b3056687288ebac6ea4b47fd Mon Sep 17 00:00:00 2001 From: gibran hemani Date: Fri, 19 Jun 2020 10:58:16 +0100 Subject: [PATCH 11/12] Revert "pygwasvcf" --- ldscore/parse.py | 111 +++++++++++++++++++++++------------------------ 1 file changed, 54 insertions(+), 57 deletions(-) diff --git a/ldscore/parse.py b/ldscore/parse.py index d23f312c..b8cdf6d2 100644 --- a/ldscore/parse.py +++ b/ldscore/parse.py @@ -10,7 +10,7 @@ import pandas as pd import os import glob -import pygwasvcf +from pysam import VariantFile import gzip @@ -59,13 +59,9 @@ def which_compression(fh): def get_compression(fh): '''Which sort of compression should we use with read_csv?''' if fh.endswith('vcf.gz'): - compression = 'vcf' - elif fh.endswith('vcf'): - compression = 'vcf' + compression = 'bcf' elif fh.endswith('bcf'): - compression = 'vcf' - elif fh.endswith('bcf.gz'): - compression = 'vcf' + compression = 'bcf' elif fh.endswith('gz'): compression = 'gzip' elif fh.endswith('bz2'): @@ -94,11 +90,11 @@ def sumstats(fh, alleles=False, dropna=True, slh=None): if alleles: usecols += ['A1', 'A2'] - if compression == 'vcf': + if compression == 'bcf': try: x = read_vcf(fh, alleles, slh) except (AttributeError, ValueError) as e: - raise ValueError('Improperly formatted VCF/BCF file: ' + str(e.args)) + raise ValueError('Improperly formatted bcf/vcf file: ' + str(e.args)) else: try: x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression) @@ -111,60 +107,61 @@ def sumstats(fh, alleles=False, dropna=True, slh=None): return x -""" -Function to read GWAS summary statistics from GWAS-VCF -:param fh: Path to GWAS-VCF -:param alleles: Bool value to include alleles in output -:param slh: SNP list (optional) -:param trait: Name of trait to select from GWAS-VCF. If None then assume a single GWAS is present in the file -""" -def read_vcf(fh, alleles, slh=None, trait=None): - with pygwasvcf.GwasVcf(fh) as vcf_in: - traits = vcf_in.get_traits() - if trait is not None: - assert trait in traits +def read_vcf(fh, alleles, slh=None): + vcf_in = VariantFile(fh) + sample = list(vcf_in.header.samples)[0] + availcols = next(vcf_in.fetch()).format.keys() + vcf_in.seek(0) + + # Check if sample size info is in header + global_fields = [x for x in vcf_in.header.records if x.key == "SAMPLE"][0] + if alleles: + dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str} + usecols = list(dtype_dict.keys()) + + # Read in data + if 'SS' in availcols: + o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.samples[sample]['SS'][0], rec.alts[0], rec.ref] for rec in vcf_in.fetch()] + N = pd.Series([x[2] for x in o], dtype='float') else: - trait = traits[0] - - # get global field info from header - metadata = vcf_in.get_metadata() - - if alleles: - dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str} - - # Read in data - o = [[ - pygwasvcf.VariantRecordGwasFuns.get_id(rec, trait, create_if_missing=True), # rsid or chr-pos-ref-alt - pygwasvcf.VariantRecordGwasFuns.get_beta(rec, trait) / pygwasvcf.VariantRecordGwasFuns.get_se(rec, trait), - pygwasvcf.VariantRecordGwasFuns.get_ss(rec, trait, metadata), # if per-snp sample size unavailable then take from header - rec.alts[0], - rec.ref - ] for rec in vcf_in.query()] + o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.alts[0], rec.ref] for rec in vcf_in.fetch()] + if 'TotalControls' in global_fields.keys() and 'TotalCases' in global_fields.keys(): + N = pd.Series([float(global_fields['TotalControls']) + float(global_fields['TotalCases'])] * len(o), dtype='float') + elif 'TotalControls' in global_fields.keys(): + N = pd.Series([float(global_fields['TotalControls'])] * len(o), dtype='float') + else: + N = pd.Series([np.NaN] * len(o), dtype='float') + + p = pd.DataFrame( + {'SNP': pd.Series([x[0] for x in o], dtype='str'), + 'Z': pd.Series([x[1] for x in o], dtype='float'), + 'N': N, + 'A1': pd.Series([x[2 + int('SS' in availcols)] for x in o], dtype='str'), + 'A2': pd.Series([x[3 + int('SS' in availcols)] for x in o], dtype='str')} + ) + else: + dtype_dict = {'SNP': str, 'Z': float, 'N': float} + usecols = list(dtype_dict.keys()) + if 'SS' in availcols: + o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.samples[sample]['SS'][0]] for rec in vcf_in.fetch()] N = pd.Series([x[2] for x in o], dtype='float') - - p = pd.DataFrame( - {'SNP': pd.Series([x[0] for x in o], dtype='str'), - 'Z': pd.Series([x[1] for x in o], dtype='float'), - 'N': N, - 'A1': pd.Series([x[3] for x in o], dtype='str'), - 'A2': pd.Series([x[4] for x in o], dtype='str')} - ) else: - dtype_dict = {'SNP': str, 'Z': float, 'N': float} + o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0]] for rec in vcf_in.fetch()] + if 'TotalControls' in global_fields.keys() and 'TotalCases' in global_fields.keys(): + N = pd.Series([float(global_fields['TotalControls']) + float(global_fields['TotalCases'])] * len(o), dtype='float') + elif 'TotalControls' in global_fields.keys(): + N = pd.Series([float(global_fields['TotalControls'])] * len(o), dtype='float') + else: + N = pd.Series([np.NaN] * len(o), dtype='float') - o = [[ - pygwasvcf.VariantRecordGwasFuns.get_id(rec, trait, create_if_missing=True), # rsid or chr-pos-ref-alt - pygwasvcf.VariantRecordGwasFuns.get_beta(rec, trait) / pygwasvcf.VariantRecordGwasFuns.get_se(rec, trait), - pygwasvcf.VariantRecordGwasFuns.get_ss(rec, trait, metadata), # if per-snp sample size unavailable then take from header - ] for rec in vcf_in.fetch()] - N = pd.Series([x[2] for x in o], dtype='float') + p = pd.DataFrame( + {'SNP': pd.Series([x[0] for x in o], dtype='str'), + 'Z': pd.Series([x[1] for x in o], dtype='float'), + 'N': N} + ) - p = pd.DataFrame( - {'SNP': pd.Series([x[0] for x in o], dtype='str'), - 'Z': pd.Series([x[1] for x in o], dtype='float'), - 'N': N} - ) + vcf_in.close() if slh is not None: compression = get_compression(slh) From 2541295e25eccfcaa67fe5f647ed28b43a836a94 Mon Sep 17 00:00:00 2001 From: Gibran Hemani Date: Fri, 19 Jun 2020 13:51:30 +0100 Subject: [PATCH 12/12] updates --- ldscore/parse.py | 4 +++- requirements.txt | 1 + test/test_parse.py | 15 +++++++-------- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/ldscore/parse.py b/ldscore/parse.py index b8cdf6d2..b4c56c81 100644 --- a/ldscore/parse.py +++ b/ldscore/parse.py @@ -58,7 +58,9 @@ def which_compression(fh): def get_compression(fh): '''Which sort of compression should we use with read_csv?''' - if fh.endswith('vcf.gz'): + if fh.endswith('vcf'): + compression = 'bcf' + elif fh.endswith('vcf.gz'): compression = 'bcf' elif fh.endswith('bcf'): compression = 'bcf' diff --git a/requirements.txt b/requirements.txt index cc076b03..dab9642e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ pybedtools>=0.7,<0.8 scipy>=0.18,<0.19 numpy>=1.16,<1.17 pandas>=0.20,<0.21 +pysam diff --git a/test/test_parse.py b/test/test_parse.py index 4dbd9265..709ee204 100644 --- a/test/test_parse.py +++ b/test/test_parse.py @@ -46,18 +46,17 @@ def test_read_sumstats(): DIR, 'parse_test/test.l2.ldscore.gz')) def test_read_vcf(): - x1 = ps.read_vcf("test/vcf_test/example1.vcf.gz", alleles=True) - x2 = ps.read_vcf("test/vcf_test/example1.vcf.gz", alleles=False) - x3 = ps.read_vcf("test/vcf_test/example2.vcf.gz", alleles=True) - x4 = ps.read_vcf("test/vcf_test/example2.vcf.gz", alleles=False) - x5 = ps.read_vcf("test/vcf_test/example3.vcf.gz", alleles=True) - x6 = ps.read_vcf("test/vcf_test/example3.vcf.gz", alleles=False) + x1 = ps.read_vcf(os.path.join(DIR, "vcf_test/example1.vcf.gz"), alleles=True) + x2 = ps.read_vcf(os.path.join(DIR, "vcf_test/example1.vcf.gz"), alleles=False) + x3 = ps.read_vcf(os.path.join(DIR, "vcf_test/example2.vcf.gz"), alleles=True) + x4 = ps.read_vcf(os.path.join(DIR, "vcf_test/example2.vcf.gz"), alleles=False) + x5 = ps.read_vcf(os.path.join(DIR, "vcf_test/example3.vcf.gz"), alleles=True) + x6 = ps.read_vcf(os.path.join(DIR, "vcf_test/example3.vcf.gz"), alleles=False) assert_equal(len(x1), len(x2)) assert_equal(len(x3), len(x4)) assert_equal(len(x5), len(x6)) assert('rs' in x1.SNP[0]) - assert_raises(ValueError, ps.read_vcf, os.path.join( - DIR, 'test/vcf_test/example1.vcf.gz')) + # assert_raises(IOError, ps.read_vcf, os.path.join(DIR, 'test/vcf_test/example1.vcf.gz'), alleles=True) def test_frq_parser(): x = ps.frq_parser(os.path.join(DIR, 'parse_test/test1.frq'), compression=None)