Added permutation testing.

arq5x · roryk · commit ae3467018602 · 2013-10-10T16:10:46.000-04:00
Also fixed the asymptotic p-value calculation which was reporting
slightly higher p-values than expected.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,13 @@
+include *.txt
+include *.md
+include *.py
+include *.sh
+include LICENSE
+include MANIFEST.in
+graft gemini/annotation_provenance
+graft gemini/data
+graft gemini/scripts
+graft gemini/static
+graft gemini/views
+graft docs
+graft test
diff --git a/gemini/gemini_main.py b/gemini/gemini_main.py
@@ -551,7 +551,11 @@ def main():
                                action='store_true',
                                default=False,
                                help="Run the C-alpha association test.")
-
+    parser_burden.add_argument('--permutations',
+                               default=0,
+                               type=int,
+                               help=("Number of permutations to run for the "
+                                     "C-alpha test (try 1000 to start)."))
     parser_burden.add_argument('--min-aaf',
                                dest='min_aaf',
                                type=float,
diff --git a/gemini/tool_burden_tests.py b/gemini/tool_burden_tests.py
@@ -1,9 +1,14 @@
 import math
 from collections import Counter, defaultdict
 import numpy as np
-from scipy.stats import binom, norm
+from scipy.stats import binom, norm, chi2
 from pandas import DataFrame
 import sys
+import random
+from math import pow
+from itertools import ifilterfalse, islice
+from scipy.misc import comb
+
 
 import GeminiQuery
 
@@ -59,7 +64,7 @@ def get_calpha(args):
         m = len(vig.keys())
 
         # m_n is the number of variants with n copies (i.e., samples with the variant)
-        m_n = Counter([len(x) for x in vig.values()])
+        #m_n = Counter([len(x) for x in vig.values()])
 
         # n_i is a list reflecting the total number of samples
         # having each variant
@@ -81,22 +86,69 @@ def get_calpha(args):
         T = _calculate_T(m, p_0, n_i, y_i)
 
         # Calculate the variance of T in order to normalize it
-        c = _calculate_c(m_n, p_0)
+        c = _calculate_c(n_i, p_0)
 
         # The final test statistic, Z, id just the original test statistic divided
         # by its standard deviation. "We reject the null when Z is larger than expected
         # using a one-tailed standard normal distribution for reference.
         if c == 0:
             Z = np.NaN
+            p_value = np.NaN
+            print "\t".join([gene, str(T), str(c), str(Z), str(p_value)])
+            continue
         else:
             Z = T / math.sqrt(c)
 
-        # sf is the survival function ... same as 1 - CDF.
-        p_value = norm.sf(Z)
-        # alternatie p-value 1 - scipy.stats.chi2.cdf(T**2/c, 1)
+        if args.permutations == 0:
+            # sf is the survival function ... same as 1 - CDF.
+            p_value = norm.sf(Z)
+        else:
+            # this permutes the cases without replacement, important for
+            # calculating an exact p-value
+            perms = permute_cases(samples, args.permutations, case)
+            T_scores = []
+            for perm_case in perms:
+                y_i = [len(filter(lambda a: a in perm_case, x)) for x in vig.values()]
+                T_permuted = _calculate_T(m, p_0, n_i, y_i)
+                T_scores.append(T_permuted)
+            false_hits = sum([x >= T for x in T_scores])
+            # add 1 to both counts so the result is a valid p-value that is never zero; see
+            # Permutation P-values Should Never Be Zero: Calculating Exact
+            # P-values When Permutations Are Randomly Drawn
+            # http://www.degruyter.com/view/j/sagmb.2010.9.1/sagmb.2010.9.1.1585/sagmb.2010.9.1.1585.xml
+            p_value = (float(false_hits) + 1) / (float(args.permutations + 1))
+
         print "\t".join([gene, str(T), str(c), str(Z), str(p_value)])
 
 
+def permute_cases(samples, permutations, case):
+    max_permutations = comb(len(samples), len(case))
+    if permutations > max_permutations:
+        sys.stderr.write("Permutations set to greater than the maximum number of "
+                         "unique permutations of cases labels. Setting it to "
+                         "%d\n." % (max_permutations))
+        permutations = max_permutations
+
+    perms = take(permutations, unique_permutations(samples, len(case)))
+    return perms
+
+def unique_permutations(iterable, length):
+    """
+    yields random sorted samples of the given length from an iterable, never repeating one
+    take(3, unique_permutations([1,2,3,4,5], 2)) => [[3,4], [1,5], [3,5]]
+    """
+    seen = set()
+    while True:
+        element = tuple(sorted(random.sample(iterable, length)))
+        if element not in seen:
+            seen.add(element)
+            yield list(element)
+
+def take(n, iterable):
+    "Return first n items of the iterable as a list"
+    return list(islice(iterable, n))
+
+
 def _get_case_and_control_samples(args):
     query = ("SELECT * from samples")
     gq = GeminiQuery.GeminiQuery(args.db)
@@ -111,10 +163,10 @@ def _get_case_and_control_samples(args):
     return cases, controls
 
 
-def _calculate_c(m_n, p_0):
+def _calculate_c(n_i, p_0):
     c = 0.0
     singleton_n = 0
-    for n in m_n:
+    for n in n_i:
         if n < 2:
             singleton_n += n
             continue
@@ -145,7 +197,6 @@ def _calculate_T(m, p_0, n_i, y_i):
         T += _variant_T_term(p_0, singleton_n, singleton_y)
     return T
 
-
 def _variant_T_term(p_0, n_i, y_i):
     return (y_i - n_i * p_0)**2 - n_i * p_0 * (1 - p_0)
 
@@ -215,3 +266,20 @@ def burden(parser, args):
         get_calpha(args)
     else:
         burden_by_gene(args)
+
+
+# unit tests of the underlying calculations
+def _test_calculate_C():
+    nn = [4, 10, 5]
+    yy = [2, 8, 0]
+    correct = 15.250000000000007
+    calc  = _calculate_c(nn, 0.5)
+    assert correct == calc
+
+def _test_calculate_T():
+    nn = [4, 10, 5]
+    yy = [2, 8, 0]
+    correct = 10.5
+
+    calc = sum([_variant_T_term(0.5, n, y) for n, y in zip(nn, yy)])
+    assert correct == calc
diff --git a/gemini/version.py b/gemini/version.py
@@ -1 +1 @@
-__version__="0.6.2"
+__version__="0.6.3a"
diff --git a/requirements.txt b/requirements.txt
@@ -11,4 +11,4 @@ ipython-cluster-helper>=0.1.9
 bx-python>=0.7.1
 pandas>=0.11.0
 scipy>=0.12.0
-git+https://github.com/arq5x/gemini.git
+gemini==0.6.2.1
diff --git a/setup.py b/setup.py
@@ -32,15 +32,14 @@
                           'bx-python >= 0.7.1',
                           'pandas >= 0.11.0',
                           'scipy >= 0.12.0'],
-        dependency_links = ['http://github.com/arq5x/cyvcf/tarball/master#egg=cyvcf-0.1.5'],
         requires = ['python (>=2.5, <3.0)'],
         packages=['gemini',
                   'gemini.scripts',
                   'gemini.data'],
         author="Aaron Quinlan and Uma Paila",
         description='A database framework for exploring genetic variation',
         long_description=long_description,
-        url="none",
+        url="http://gemini.readthedocs.org",
         package_dir = {'gemini': "gemini"},
         package_data = {'gemini': ['data/gemini.conf']},
         zip_safe = False,

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__="0.6.2"`
	`1`	`+__version__="0.6.3a"`