import numpy as np
import pandas as pd
import scipy.stats as stats
import scipy.io as sio
import os, sys, argparse, time, logging, getpass
import warnings
import matplotlib.pyplot as plt
# read_sumdata, basic_QC_P, deduplcate_sum and map_snps used below are
# provided by this star import.
from GWAS_IO.summary_stats_Utils import *

def read_sum_dat(sumFile, logger, kargs):
    '''
    Read the given summary statistics file.

    Input:
    ------
    sumFile, Path of summary stats file.
    logger,  Python logger for process information.
    kargs,   Namespace object of options.

    Return:
    -------
    sumDat, DataFrame of summary dataset.

    Note:
    -----
    1. Field names (if present) are standardized as follows:
           effCol  -> Beta
           ORCol   -> OR
           effACol -> A1
           othACol -> A2
           posCol  -> POS
           infoCol -> INFO
           NCol    -> N
       Chromosome names are also standardized:
           strip 'CHR', 'Chr', etc. -> integer
           recode chrX -> 23
           recode chrY -> 24
           recode chrM -> 25
    2. SNPs with invalid p values (> 1, < 0, or NaN) are removed.
    3. Duplicated SNPs are removed.
    '''
    if not os.access(sumFile, os.R_OK):
        raise ValueError("Can't read summary stats file: {}".format(sumFile))
    logger.info('*** Loading summary stats ***')
    logger.info('Read summary data from {}'.format(sumFile))
    sumDat = read_sumdata(sumFile, kargs.snpCol, kargs.pCol, kargs)
    logger.info('......')
    logger.info('Read {} SNPs'.format(sumDat.shape[0]))
    colnames = ['SNP', 'P', 'A1', 'CHR', 'POS', 'Beta', 'A2']
    if 'P' not in sumDat.columns:
        raise RuntimeError('No P value provided')
    if 'SNP' not in sumDat.columns:
        raise RuntimeError('No SNP ID provided')
    if not kargs.effACol:
        warnings.warn('No effective allele provided')
        logger.warning('No effective allele provided')
        colnames.remove('A1')
    if not kargs.othACol:
        warnings.warn('No other allele provided')
        logger.warning('No other allele provided')
        colnames.remove('A2')
    if not kargs.effCol:
        if not kargs.orCol:
            colnames.remove('Beta')
            logger.warning('Directionality is not checked')
        else:
            # Convert odds ratios to effect sizes on the log scale.
            sumDat.loc[:, 'Beta'] = np.log(sumDat.loc[:, 'OR'])
            sumDat.drop('OR', axis=1, inplace=True)
    if (not kargs.effACol) and (not kargs.othACol):
        logger.warning('Directionality is not checked')
        # Guard: 'Beta' may already have been dropped above.
        if 'Beta' in colnames:
            colnames.remove('Beta')
        if 'Beta' in sumDat.columns:
            sumDat.drop('Beta', axis=1, inplace=True)
    if (not kargs.posCol) or (not kargs.chrCol):
        logger.info('Using SNP ID only to align summary data to reference')
        colnames.remove('POS')
        colnames.remove('CHR')
        keys = ['SNP']
    elif kargs.forceID:
        keys = ['SNP']
    else:
        keys = ['CHR', 'POS']
    if kargs.NCol:
        colnames.append('N')
    logger.info('Reading summary stats done\n')
    logger.info('**** check P values ****')
    sumDat = basic_QC_P(sumDat, kargs.outdir, 'P', logger)
    logger.info('**** END check P values ****')
    logger.info('**** check duplicated SNPs ****')
    sumDat, dup = deduplcate_sum(sumDat, 'P', keys)
    if dup.shape[0] > 0:
        dupFile = os.path.join(kargs.outdir, 'Duplicated_snps.gz')
        logger.warning('There are {} duplicated SNPs in {}'.format(
            dup.shape[0], sumFile))
        logger.warning('\t The SNP with minimum p value is kept')
        logger.warning('See all duplicated SNPs in {}'.format(dupFile))
        dup.to_csv(dupFile, index=False, na_rep='NA', compression='gzip',
                sep='\t')
    logger.info('**** END check duplicated SNPs ****')
    sumDat = sumDat.loc[:, colnames]
    logger.debug(sumDat.head())
    return sumDat

def read_ref_dat(refFile, logger):
    '''
    Read the in-house reference dataset.

    Input:
    ------
    refFile, Path of reference file with columns:
             CHR, SNP, GP, BP, A1, A2, complementA1, complementA2
    logger,  Python logger for process information.

    Return:
    -------
    refDat, DataFrame of reference dataset.
    '''
    if not os.access(refFile, os.R_OK):
        raise ValueError("Can't read reference file: {}".format(refFile))
    logger.info('*** Loading reference data ***')
    refDat = pd.read_csv(refFile)
    refDat.rename(columns={'BP': 'POS', 'A1': 'refA1', 'A2': 'refA2'},
            inplace=True)
    logger.info('Read reference data from {}'.format(refFile))
    logger.info('Read {} SNPs from reference data'.format(refDat.shape[0]))
    print('*** Using reference with {} SNPs ***'.format(refDat.shape[0]))
    logger.info('Reading reference data done\n')
    return refDat
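
# A hedged sketch of the expected reference CSV layout, inferred from the
# docstring above (the header names come from the docstring; the column
# order and the data row are assumptions for illustration only):
#
#   CHR,SNP,GP,BP,A1,A2,complementA1,complementA2
#   1,rs0000001,0.0,10583,A,G,T,C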

def _qq(pvec, ax):
    '''
    Make a basic QQ plot of p values.
    '''
    pvec = pvec[np.isfinite(pvec)]
    pvec[pvec < 1e-20] = 1e-20  # cap extreme p values to keep the plot readable
    logpSort = -np.log10(np.sort(pvec))
    n = logpSort.shape[0]
    # Expected -log10 p values under the null: -log10(i/n) for i = 1..n.
    logpTheo = -np.log10(np.cumsum(np.repeat(1.0 / n, n)))
    ax.scatter(logpTheo, logpSort)
    x = np.linspace(*ax.get_xlim())
    ax.plot(x, x)
    ax.set_xlabel('Theoretical -log10 (P)')
    ax.set_ylabel('Observed -log10 (P)')
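
# A minimal check of the expected-quantile math used in _qq: with n = 4
# p values,
#   np.cumsum(np.repeat(1.0 / 4, 4)) -> [0.25, 0.5, 0.75, 1.0]
# so the theoretical -log10 values are about [0.602, 0.301, 0.125, 0.0],
# i.e. -log10(i/n) for ranks i = 1..4, plotted against the sorted
# observed -log10 p values.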

def summarize_merge(sumDat, mDat, misDat, outdir, logger):
    '''
    Make QQ plots of the original, converted, and missed datasets.

    Input:
    ------
    sumDat, DataFrame of original summary stats.
    mDat,   DataFrame of converted summary data.
    misDat, DataFrame of SNPs in original but not in converted.
    outdir, Where to save the figure.
    logger, Python logger for process information.

    No return.

    TO-DO:
        Draw multiple curves in one figure.
    '''
    logger.info('\n')
    if sumDat.shape[0] < 10:
        logger.error('Too few SNPs converted! N={}'.format(sumDat.shape[0]))
        raise RuntimeError(
            'Too few SNPs converted! N={}'.format(sumDat.shape[0]))
    fig = plt.figure(facecolor='white')
    ax = fig.add_subplot(131)
    _qq(sumDat.loc[:, 'P'].values, ax)
    plt.title('Original')
    ax = fig.add_subplot(132)
    _qq(mDat.loc[:, 'P'].values, ax)
    plt.title('Converted')
    ax = fig.add_subplot(133)
    _qq(misDat.loc[:, 'P'].values, ax)
    plt.title('Missed')
    plt.tight_layout()
    plt.savefig(os.path.join(outdir, 'QQ_convert.png'), format='png')
    plt.close()
    logger.info('Comparing P values in QQ_convert.png')

def check_zscore(zvec, outdir, logger):
    '''
    Check the distribution of converted z-scores (real z-scores, not Anders').

    Input:
    ------
    zvec,   Vector of converted z-scores.
    outdir, Where to save the figure.
    logger, Python logger for process information.

    No return.
    '''
    logger.info('\n')
    fig = plt.figure(facecolor='white')
    pd.Series(zvec[np.isfinite(zvec)]).hist(bins=100)
    plt.title('Z-Scores')
    plt.tight_layout()
    plt.savefig(os.path.join(outdir, 'Z_scores.png'), format='png')
    plt.close()
    logger.info('Check converted Z-scores at Z_scores.png')

def align2ref(sumDat, refDat, logger, kargs):
    '''
    Align given summary data to the in-house reference dataset.

    Input:
    ------
    sumDat, DataFrame of summary statistics.
    refDat, DataFrame of in-house reference dataset.
    logger, Python logger for process information.
    kargs,  Namespace object of options.

    Return:
    -------
    -log10 p values, z-scores, and (if available) per-SNP sample sizes.

    Note:
    -----
    1. Ambiguous SNPs (A/T and C/G pairs) are removed based on the
       in-house reference dataset.
    2. Effects are aligned with the allele coding of the reference.
    '''
    if kargs.forceID:
        keys = ['SNP']
    elif ('CHR' not in sumDat.columns) or ('POS' not in sumDat.columns):
        keys = ['SNP']
    else:
        keys = ['CHR', 'POS']
    mDat, misDat1 = map_snps(refDat, sumDat, keys, 'sum', False)
    mDat.to_csv(os.path.join(kargs.outdir, 'debug_merged.txt.gz'),
            sep='\t', index=False, na_rep='NA')
    logger.info('*** Align SNPs to reference ***')
    if misDat1.shape[0] > 0:
        outF = os.path.join(kargs.outdir, 'SNPs_not_in_sumFile.txt.gz')
        logger.info(
            'There are {} SNPs in reference not in given summary file'.format(
                misDat1.shape[0]))
        logger.info('Details see {}'.format(outF))
        misDat1.to_csv(outF, index=False, sep='\t', compression='gzip',
                na_rep='NA')
    dummy, misDat2 = map_snps(sumDat, refDat, keys, 'ref')
    if misDat2.shape[0] > 0:
        outF = os.path.join(kargs.outdir, 'SNPs_not_in_refFile.txt.gz')
        logger.info(
            'There are {} SNPs in summary file not in reference'.format(
                misDat2.shape[0]))
        logger.info('Details see {}'.format(outF))
        misDat2.to_csv(outF, index=False, sep='\t', compression='gzip',
                na_rep='NA')
    signvec = np.empty((mDat.shape[0],), dtype='float')
    signvec.fill(np.nan)
    # Strand-ambiguous SNPs: A/T and C/G pairs cannot be resolved against
    # the reference strand.
    ambivec = (((mDat.refA1 == 'A') & (mDat.refA2 == 'T')) |
               ((mDat.refA2 == 'A') & (mDat.refA1 == 'T')) |
               ((mDat.refA1 == 'C') & (mDat.refA2 == 'G')) |
               ((mDat.refA2 == 'C') & (mDat.refA1 == 'G')))
    ambivec = ambivec.values
    logger.info('{} SNPs have ambiguously coded alleles in ref'.format(
        np.sum(ambivec)))
    logger.info('Z-scores of ambiguously coded SNPs were set to NaN')
    ambDat = mDat.loc[ambivec, :]
    ambDat.to_csv(os.path.join(kargs.outdir, 'Ambiguous_data.txt.gz'),
            compression='gzip', sep='\t', index=False, na_rep='NA')
    logger.info('Saved SNPs with ambiguous allele coding to {}'.format(
        os.path.join(kargs.outdir, 'Ambiguous_data.txt.gz')))
    logpvec = -np.log10(mDat.loc[:, 'P'])
    if 'A1' not in sumDat.columns:
        zvec = signvec.copy()
    else:
        if 'A2' not in sumDat.columns:
            # Match on the effect allele alone, directly or via the
            # complementary strand.
            idx1 = ((mDat.A1 == mDat.refA1) | (mDat.A1 == mDat.A1c)).values
            idx_1 = ((mDat.A1 == mDat.refA2) | (mDat.A1 == mDat.A2c)).values
        else:
            # Match on both alleles, directly or via the complementary strand.
            idx1 = (((mDat.A1 == mDat.refA1) & (mDat.A2 == mDat.refA2)) |
                    ((mDat.A1 == mDat.A1c) & (mDat.A2 == mDat.A2c))).values
            idx_1 = (((mDat.A1 == mDat.refA2) & (mDat.A2 == mDat.refA1)) |
                     ((mDat.A1 == mDat.A2c) & (mDat.A2 == mDat.A1c))).values
        signvec[idx1] = 1.0
        signvec[idx_1] = -1.0
        signvec[ambivec] = np.nan
        signvec = signvec * np.sign(mDat.loc[:, 'Beta'].values)
        # Two-sided p value to |z|, then signed by the aligned effect direction.
        zvec = np.abs(stats.norm.ppf(mDat.loc[:, 'P'].values * 0.5)) * signvec
        logger.info('{} SNPs had direction opposite to the reference '
                'and were flipped'.format(np.sum(idx_1)))
        mDat.loc[:, 'newZ'] = zvec
        tmpMdat = mDat.loc[idx_1, :]
        tmpMdat.to_csv(os.path.join(kargs.outdir, 'flip_data.txt.gz'),
                index=False, sep='\t', compression='gzip', na_rep='NA')
    summarize_merge(sumDat, mDat, misDat2, kargs.outdir, logger)
    logger.info('\n')
    if kargs.NCol:
        return logpvec.values, zvec, mDat.loc[:, 'N'].values
    else:
        return logpvec.values, zvec, []
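
# Sketch of the p-to-z conversion used in align2ref: a two-sided p value is
# mapped to |z| through the inverse normal CDF and then signed by the allele
# alignment. For example, p = 0.05 gives
#   abs(stats.norm.ppf(0.05 * 0.5)) ~= 1.96
# with sign +1 if A1/A2 match the reference coding (directly or on the
# complementary strand), -1 if they are swapped, and NaN if ambiguous.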

def save2mat(logpvec, zvec, Nvec, trait, outdir, logger):
    '''
    Save data as a Matlab dataset.

    Input:
    ------
    logpvec, -log10 p value vector.
    zvec,    Z-score vector.
    Nvec,    Per-SNP sample size vector (may be empty).
    trait,   Name of phenotype.
    outdir,  Where to save the dataset.
    logger,  Python logger for process information.

    No return.
    '''
    outfile = os.path.join(outdir, trait)
    if len(Nvec) == len(logpvec):
        tmpdict = {'logpvec_' + trait.lower(): logpvec,
                   'zvec_' + trait.lower(): zvec,
                   'nvec_' + trait.lower(): Nvec}
    else:
        tmpdict = {'logpvec_' + trait.lower(): logpvec,
                   'zvec_' + trait.lower(): zvec}
    sio.savemat(outfile, tmpdict, format='5', do_compression=False,
            oned_as='column')
    logger.info('Saved converted data to {}'.format(outfile + '.mat'))
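
# A hedged example of reading the converted dataset back, assuming a trait
# named 'HEIGHT' was saved to the current directory (file and key names
# follow the save2mat convention above; arrays are saved as columns, hence
# the flatten):
#
#   import scipy.io as sio
#   mat = sio.loadmat('HEIGHT.mat')
#   logpvec = mat['logpvec_height'].flatten()
#   zvec = mat['zvec_height'].flatten()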

def convert_sum():
    parser = argparse.ArgumentParser(prog="Preprocess Summary stats",
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
            description='Preprocess summary stats for matlab')
    parser.add_argument('--sumFile', type=str, help='Summary stats file')
    parser.add_argument('--ref', type=str, help='Reference file')
    parser.add_argument('--trait', type=str, help='Trait name')
    parser.add_argument('--outdir', type=str, help='Output DIR', default=".")
    parser.add_argument('--forceID', action='store_true', default=False,
            help='Force matching by SNP ID rather than position')
    parser.add_argument('--snpCol', type=str, help='SNP ID field',
            default='SNP')
    parser.add_argument('--pCol', type=str, help='P value field', default='P')
    parser.add_argument('--effACol', type=str, help='Effective allele field',
            default=None)
    parser.add_argument('--othACol', type=str, help='The other allele field',
            default=None)
    parser.add_argument('--effCol', type=str, help='Effect size field',
            default=None)
    parser.add_argument('--orCol', type=str, help='Odds ratio field',
            default=None)
    parser.add_argument('--NCol', type=str, help='Sample size per SNP field',
            default=None)
    parser.add_argument('--posCol', type=str, help='Genomic position field',
            default=None)
    parser.add_argument('--chrCol', type=str, help='Chromosome field',
            default=None)
    args = parser.parse_args()
    if not os.access(args.outdir, os.F_OK):
        os.mkdir(args.outdir)
    if not os.access(args.sumFile, os.R_OK):
        raise ValueError("Can't read summary stats file: {}".format(
            args.sumFile))
    if not os.access(args.ref, os.R_OK):
        raise ValueError("Can't read reference file: {}".format(args.ref))
    logfile = os.path.join(args.outdir, 'convert_' + args.trait + '.log')
    logger = logging.getLogger()
    logger.addHandler(logging.FileHandler(logfile, mode='w'))
    logger.setLevel(logging.DEBUG)
    sumDat = read_sum_dat(args.sumFile, logger, args)
    refDat = read_ref_dat(args.ref, logger)
    logpvec, zvec, Nvec = align2ref(sumDat, refDat, logger, args)
    check_zscore(zvec, args.outdir, logger)
    save2mat(logpvec, zvec, Nvec, args.trait, args.outdir, logger)
    logger.info('\n**********\nFinished at {}'.format(time.ctime()))
    logger.info('Author: {} at {}'.format(getpass.getuser(), time.ctime()))

if __name__ == "__main__":
    # time and numpy are already imported at module level.
    tsts = time.time()
    convert_sum()
    print()
    print('Finished at %s' % time.ctime())
    ted = time.time()
    print('Time taken %d mins %d sec' % ((ted - tsts) // 60,
            np.round(ted - tsts) % 60))
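
# Example invocation (file and column names here are hypothetical; the flags
# are those defined in convert_sum above):
#
#   python sumStats2ref.py --sumFile GWAS.txt.gz --ref reference.csv \
#       --trait HEIGHT --outdir ./out --effACol A1 --othACol A2 \
#       --effCol BETA --chrCol CHR --posCol BP --NCol N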