Skip to content

Commit ae34670

Browse files
arq5xroryk
arq5x
authored and committed
Added permutation testing.
Also fixed the asymptotic p-value calculation which was reporting slightly higher p-values than expected.
1 parent b0f9873 commit ae34670

6 files changed

+98
-14
lines changed

MANIFEST.in

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
include *.txt
2+
include *.md
3+
include *.py
4+
include *.sh
5+
include LICENSE
6+
include MANIFEST.in
7+
graft gemini/annotation_provenance
8+
graft gemini/data
9+
graft gemini/scripts
10+
graft gemini/static
11+
graft gemini/views
12+
graft docs
13+
graft test

gemini/gemini_main.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -551,7 +551,11 @@ def main():
551551
action='store_true',
552552
default=False,
553553
help="Run the C-alpha association test.")
554-
554+
parser_burden.add_argument('--permutations',
555+
default=0,
556+
type=int,
557+
help=("Number of permutations to run for the "
558+
"C-alpha test (try 1000 to start)."))
555559
parser_burden.add_argument('--min-aaf',
556560
dest='min_aaf',
557561
type=float,

gemini/tool_burden_tests.py

+77-9
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,14 @@
11
import math
22
from collections import Counter, defaultdict
33
import numpy as np
4-
from scipy.stats import binom, norm
4+
from scipy.stats import binom, norm, chi2
55
from pandas import DataFrame
66
import sys
7+
import random
8+
from math import pow
9+
from itertools import ifilterfalse, islice
10+
from scipy.misc import comb
11+
712

813
import GeminiQuery
914

@@ -59,7 +64,7 @@ def get_calpha(args):
5964
m = len(vig.keys())
6065

6166
# m_n is the number of variants with n copies (i.e., samples with the variant)
62-
m_n = Counter([len(x) for x in vig.values()])
67+
#m_n = Counter([len(x) for x in vig.values()])
6368

6469
# n_i is a list reflecting the total number of samples
6570
# having each variant
@@ -81,22 +86,69 @@ def get_calpha(args):
8186
T = _calculate_T(m, p_0, n_i, y_i)
8287

8388
# Calculate the variance of T in order to normalize it
84-
c = _calculate_c(m_n, p_0)
89+
c = _calculate_c(n_i, p_0)
8590

8691
# The final test statistic, Z, is just the original test statistic divided
8792
# by its standard deviation. "We reject the null when Z is larger than expected
8893
# using a one-tailed standard normal distribution for reference."
8994
if c == 0:
9095
Z = np.NaN
96+
p_value = np.NaN
97+
print "\t".join([gene, str(T), str(c), str(Z), str(p_value)])
98+
continue
9199
else:
92100
Z = T / math.sqrt(c)
93101

94-
# sf is the survival function ... same as 1 - CDF.
95-
p_value = norm.sf(Z)
96-
# alternatie p-value 1 - scipy.stats.chi2.cdf(T**2/c, 1)
102+
if args.permutations == 0:
103+
# sf is the survival function ... same as 1 - CDF.
104+
p_value = norm.sf(Z)
105+
else:
106+
# this permutes the cases without replacement, important for
107+
# calculating an exact p-value
108+
perms = permute_cases(samples, args.permutations, case)
109+
T_scores = []
110+
for perm_case in perms:
111+
y_i = [len(filter(lambda a: a in perm_case, x)) for x in vig.values()]
112+
T_permuted = _calculate_T(m, p_0, n_i, y_i)
113+
T_scores.append(T_permuted)
114+
false_hits = sum([x >= T for x in T_scores])
115+
# the + 1 to make it an unbiased estimator
116+
# Permutation P-values Should Never Be Zero: Calculating Exact
117+
# P-values When Permutations Are Randomly Drawn
118+
# http://www.degruyter.com/view/j/sagmb.2010.9.1/sagmb.2010.9.1.1585/sagmb.2010.9.1.1585.xml
119+
p_value = (float(false_hits) + 1) / (float(args.permutations + 1))
120+
97121
print "\t".join([gene, str(T), str(c), str(Z), str(p_value)])
98122

99123

124+
def permute_cases(samples, permutations, case):
    """
    Draw distinct random re-labelings of the case group.

    samples      -- sequence of all samples that cases are drawn from
    permutations -- number of re-labelings requested
    case         -- the observed case samples; only its length is used

    Returns a list holding at most `permutations` selections, each one a
    list of len(case) samples, with no selection repeated.  When more
    selections are requested than exist, a warning is written to stderr and
    the request is reduced to the number that exist.
    """
    # exact=True makes comb() return an integer.  The default result is a
    # float, which loses precision for large counts and would otherwise be
    # handed to islice() (via take) as the item count after clamping.
    max_permutations = comb(len(samples), len(case), exact=True)
    if permutations > max_permutations:
        sys.stderr.write("Permutations set to greater than the maximum number of "
                         "unique permutations of cases labels. Setting it to "
                         "%d.\n" % (max_permutations))
        permutations = max_permutations

    perms = take(permutations, unique_permutations(samples, len(case)))
    return perms
134+
135+
def unique_permutations(iterable, length):
    """
    Yield random selections of `length` items from `iterable`, never
    repeating a selection.

    Every selection is yielded as a sorted list, so two draws holding the
    same items in a different order count as the same selection.  The
    generator finishes once every possible selection has been produced
    (the count of possible selections assumes the items are distinct).

    take(3, unique_permutations([1,2,3,4,5], 2)) => [3,4], [1,5], [2,3]

    Raises ValueError (on first use) when `length` is negative or larger
    than the number of items.
    """
    pool = list(iterable)
    size = len(pool)
    if length < 0 or length > size:
        raise ValueError("sample larger than population or is negative")

    # Number of distinct selections, C(size, length), built up with integer
    # arithmetic only; after step i the running value is C(size-length+i, i).
    total = 1
    for i in range(1, length + 1):
        total = total * (size - length + i) // i

    seen = set()
    # Stop when everything has been produced; looping on would spin forever
    # because no unseen selection is left to draw.
    while len(seen) < total:
        element = tuple(sorted(random.sample(pool, length)))
        if element not in seen:
            seen.add(element)
            yield list(element)
146+
147+
def take(n, iterable):
    """Collect up to the first n items of the iterable into a new list."""
    head = islice(iterable, n)
    return list(head)
150+
151+
100152
def _get_case_and_control_samples(args):
101153
query = ("SELECT * from samples")
102154
gq = GeminiQuery.GeminiQuery(args.db)
@@ -111,10 +163,10 @@ def _get_case_and_control_samples(args):
111163
return cases, controls
112164

113165

114-
def _calculate_c(m_n, p_0):
166+
def _calculate_c(n_i, p_0):
115167
c = 0.0
116168
singleton_n = 0
117-
for n in m_n:
169+
for n in n_i:
118170
if n < 2:
119171
singleton_n += n
120172
continue
@@ -145,7 +197,6 @@ def _calculate_T(m, p_0, n_i, y_i):
145197
T += _variant_T_term(p_0, singleton_n, singleton_y)
146198
return T
147199

148-
149200
def _variant_T_term(p_0, n_i, y_i):
150201
return (y_i - n_i * p_0)**2 - n_i * p_0 * (1 - p_0)
151202

@@ -215,3 +266,20 @@ def burden(parser, args):
215266
get_calpha(args)
216267
else:
217268
burden_by_gene(args)
269+
270+
271+
# unit tests of the underlying calculations
272+
def _test_calculate_C():
    """Check _calculate_c against a precomputed value for p_0 = 0.5."""
    nn = [4, 10, 5]
    correct = 15.250000000000007
    calc = _calculate_c(nn, 0.5)
    # Compare with a tolerance: the reference value itself carries float
    # rounding noise, so exact equality would depend on evaluation order.
    assert abs(calc - correct) < 1e-9
278+
279+
def _test_calculate_T():
    """Check that the per-variant T terms for p_0 = 0.5 add up to 10.5."""
    nn = [4, 10, 5]
    yy = [2, 8, 0]
    expected = 10.5

    total = 0
    for n, y in zip(nn, yy):
        total += _variant_T_term(0.5, n, y)
    assert expected == total

gemini/version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__="0.6.2"
1+
__version__="0.6.3a"

requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,4 @@ ipython-cluster-helper>=0.1.9
1111
bx-python>=0.7.1
1212
pandas>=0.11.0
1313
scipy>=0.12.0
14-
git+https://github.com/arq5x/gemini.git
14+
gemini==0.6.2.1

setup.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,14 @@
3232
'bx-python >= 0.7.1',
3333
'pandas >= 0.11.0',
3434
'scipy >= 0.12.0'],
35-
dependency_links = ['http://github.com/arq5x/cyvcf/tarball/master#egg=cyvcf-0.1.5'],
3635
requires = ['python (>=2.5, <3.0)'],
3736
packages=['gemini',
3837
'gemini.scripts',
3938
'gemini.data'],
4039
author="Aaron Quinlan and Uma Paila",
4140
description='A database framework for exploring genetic variation',
4241
long_description=long_description,
43-
url="none",
42+
url="http://gemini.readthedocs.org",
4443
package_dir = {'gemini': "gemini"},
4544
package_data = {'gemini': ['data/gemini.conf']},
4645
zip_safe = False,

0 commit comments

Comments
 (0)