1
1
import math
2
2
from collections import Counter , defaultdict
3
3
import numpy as np
4
- from scipy .stats import binom , norm
4
+ from scipy .stats import binom , norm , chi2
5
5
from pandas import DataFrame
6
6
import sys
7
+ import random
8
+ from math import pow
9
+ from itertools import ifilterfalse , islice
10
+ from scipy .misc import comb
11
+
7
12
8
13
import GeminiQuery
9
14
@@ -59,7 +64,7 @@ def get_calpha(args):
59
64
m = len (vig .keys ())
60
65
61
66
# m_n is the number of variants with n copies (i.e., samples with the variant)
62
- m_n = Counter ([len (x ) for x in vig .values ()])
67
+ # m_n = Counter([len(x) for x in vig.values()])
63
68
64
69
# n_i is a list reflecting the total number of samples
65
70
# having each variant
@@ -81,22 +86,69 @@ def get_calpha(args):
81
86
T = _calculate_T (m , p_0 , n_i , y_i )
82
87
83
88
# Calculate the variance of T in order to normalize it
84
- c = _calculate_c (m_n , p_0 )
89
+ c = _calculate_c (n_i , p_0 )
85
90
86
91
# The final test statistic, Z, is just the original test statistic divided
87
92
# by its standard deviation. "We reject the null when Z is larger than expected
88
93
# using a one-tailed standard normal distribution for reference.
89
94
if c == 0 :
90
95
Z = np .NaN
96
+ p_value = np .NaN
97
+ print "\t " .join ([gene , str (T ), str (c ), str (Z ), str (p_value )])
98
+ continue
91
99
else :
92
100
Z = T / math .sqrt (c )
93
101
94
- # sf is the survival function ... same as 1 - CDF.
95
- p_value = norm .sf (Z )
96
- # alternatie p-value 1 - scipy.stats.chi2.cdf(T**2/c, 1)
102
+ if args .permutations == 0 :
103
+ # sf is the survival function ... same as 1 - CDF.
104
+ p_value = norm .sf (Z )
105
+ else :
106
+ # this permutes the cases without replacement, important for
107
+ # calculating an exact p-value
108
+ perms = permute_cases (samples , args .permutations , case )
109
+ T_scores = []
110
+ for perm_case in perms :
111
+ y_i = [len (filter (lambda a : a in perm_case , x )) for x in vig .values ()]
112
+ T_permuted = _calculate_T (m , p_0 , n_i , y_i )
113
+ T_scores .append (T_permuted )
114
+ false_hits = sum ([x >= T for x in T_scores ])
115
+ # the + 1 to make it an unbiased estimator
116
+ # Permutation P-values Should Never Be Zero: Calculating Exact
117
+ # P-values When Permutations Are Randomly Drawn
118
+ # http://www.degruyter.com/view/j/sagmb.2010.9.1/sagmb.2010.9.1.1585/sagmb.2010.9.1.1585.xml
119
+ p_value = (float (false_hits ) + 1 ) / (float (args .permutations + 1 ))
120
+
97
121
print "\t " .join ([gene , str (T ), str (c ), str (Z ), str (p_value )])
98
122
99
123
124
def permute_cases(samples, permutations, case):
    """
    Draw distinct random relabelings of the case group.

    samples: sequence of all sample identifiers to draw from
    permutations: number of distinct relabelings requested
    case: the observed case group; only its size is used

    Returns a list of lists, each holding len(case) samples. If more
    relabelings are requested than distinct subsets exist, a warning is
    written to stderr and the count is capped at that maximum.
    """
    # exact=True makes comb return an integer. The default floating point
    # result cannot be used as the stop argument of islice (via take) when
    # the requested count has to be capped.
    max_permutations = int(comb(len(samples), len(case), exact=True))
    if permutations > max_permutations:
        sys.stderr.write("Permutations set to greater than the maximum number of "
                         "unique permutations of cases labels. Setting it to "
                         "%d.\n" % max_permutations)
        permutations = max_permutations

    perms = take(permutations, unique_permutations(samples, len(case)))
    return perms
134
+
135
def unique_permutations(iterable, length):
    """
    Endlessly generate random subsets of ``iterable`` of size ``length``,
    never yielding the same set of members twice. Each subset is yielded
    as a sorted list.

    e.g. take(3, unique_permutations([1, 2, 3, 4, 5], 2))
         might give [[3, 4], [1, 5], [3, 5]]

    NOTE: the generator never finishes on its own; once every possible
    subset has been produced it keeps sampling without yielding, so the
    caller must bound how many items it consumes.
    """
    already_yielded = set()
    while True:
        # sorting makes two draws of the same members compare equal
        candidate = tuple(sorted(random.sample(iterable, length)))
        if candidate in already_yielded:
            continue
        already_yielded.add(candidate)
        yield list(candidate)
146
+
147
def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list."""
    leading_items = islice(iterable, n)
    return list(leading_items)
150
+
151
+
100
152
def _get_case_and_control_samples (args ):
101
153
query = ("SELECT * from samples" )
102
154
gq = GeminiQuery .GeminiQuery (args .db )
@@ -111,10 +163,10 @@ def _get_case_and_control_samples(args):
111
163
return cases , controls
112
164
113
165
114
- def _calculate_c (m_n , p_0 ):
166
+ def _calculate_c (n_i , p_0 ):
115
167
c = 0.0
116
168
singleton_n = 0
117
- for n in m_n :
169
+ for n in n_i :
118
170
if n < 2 :
119
171
singleton_n += n
120
172
continue
@@ -145,7 +197,6 @@ def _calculate_T(m, p_0, n_i, y_i):
145
197
T += _variant_T_term (p_0 , singleton_n , singleton_y )
146
198
return T
147
199
148
-
149
200
def _variant_T_term(p_0, n_i, y_i):
    """
    Return one variant's term of the T statistic:
    (y_i - n_i * p_0)**2 - n_i * p_0 * (1 - p_0)

    p_0: the probability used to scale n_i
    n_i: count multiplied by p_0
    y_i: count compared against n_i * p_0
    """
    scaled_n = n_i * p_0
    squared_deviation = (y_i - scaled_n) ** 2
    correction = scaled_n * (1 - p_0)
    return squared_deviation - correction
151
202
@@ -215,3 +266,20 @@ def burden(parser, args):
215
266
get_calpha (args )
216
267
else :
217
268
burden_by_gene (args )
269
+
270
+
271
+ # unit tests of the underlying calculations
272
def _test_calculate_C():
    """Check _calculate_c against a stored reference value for p_0 = 0.5."""
    nn = [4, 10, 5]
    correct = 15.250000000000007
    calc = _calculate_c(nn, 0.5)
    # The reference value carries floating point rounding residue, so compare
    # with a tolerance instead of requiring bit-for-bit equality.
    assert abs(correct - calc) < 1e-9, \
        "expected %r, got %r" % (correct, calc)
278
+
279
def _test_calculate_T():
    """Check that the per-variant T terms for a small example add up to 10.5."""
    p_0 = 0.5
    # (n, y) pairs, one per variant
    variant_counts = [(4, 2), (10, 8), (5, 0)]
    expected_total = 10.5

    running_total = 0
    for n, y in variant_counts:
        running_total += _variant_T_term(p_0, n, y)
    assert expected_total == running_total
0 commit comments