-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpset2_impute_my_data.py
61 lines (48 loc) · 1.67 KB
/
pset2_impute_my_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python
"""
CSE284 - Personal Genomics for Bioinformaticians
Problem Set 2 - Ancestry: Imputation
Example usage:
./pset2_impute_my_data.py \
ps2_impute.combined \
ps2_impute.heldout.gen.gz \
1000GP_Phase3_chr16.legend.gz
This script outputs Pearson r2 comparing true vs. imputed genotypes
"""
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import sys
# Read the three required positional arguments. If any is missing, print the
# module usage text to stderr and exit with status 1.
try:
    imputefile = sys.argv[1]
    truthfile = sys.argv[2]
    legendfile = sys.argv[3]
except IndexError:
    # Only a too-short argv should trigger the usage message; a bare
    # ``except:`` would also intercept KeyboardInterrupt and SystemExit.
    # ``__doc__`` is None when run with -OO, so fall back to an empty string.
    sys.stderr.write((__doc__ or "")+"\n")
    sys.exit(1)
# Load imputation results: a tab-separated table whose five columns are
# renamed below (assumes the file has exactly five columns -- TODO confirm).
impres = pd.read_csv(imputefile, sep="\t")
impres.columns = ["chr", "position", "ref1", "ref2", "gt"]
# Force integer positions so the joins below compare like with like.
impres = impres.astype({"position": np.int64})

# Load truth: space-separated, no header row; column names are supplied here.
truth = pd.read_csv(
    truthfile,
    sep=" ",
    names=[
        "chrom", "rsid", "position", "ref", "alt",
        "truth_00", "truth_01", "truth_11",
    ],
)

# Merge: keep only positions present in both imputed and truth tables
# (pandas' default inner join).
impres = impres.merge(truth, on="position")

# Load legend: space-separated, header row taken from the file itself.
legend = pd.read_csv(legendfile, sep=" ")

# Merge the legend columns onto the combined table, again by position.
data = impres.merge(legend, on="position")
# Pick the best-supported genotype code from three per-genotype scores
def GetGenotype(x00, x01, x11):
    """Return the genotype code (0, 1 or 2) that has the highest score.

    Args:
        x00: Score for genotype code 0.
        x01: Score for genotype code 1.
        x11: Score for genotype code 2.

    Returns:
        int: 0, 1 or 2 -- the position of the largest of the three scores.
        When several scores tie for the maximum, the lowest code is returned.
    """
    scores = (x00, x01, x11)
    # max() keeps the first maximal element it encounters, so ties resolve
    # to the lowest genotype code.
    return max(range(len(scores)), key=scores.__getitem__)
# Get r2 for each reference panel
def _r2(frame):
    """Return the squared Pearson r between ``truth_gt`` and ``gt``.

    Returns NaN when ``frame`` has fewer than two rows, because
    ``scipy.stats.pearsonr`` raises ValueError on such input and would
    otherwise abort the whole report.
    """
    if frame.shape[0] < 2:
        return float("nan")
    return pearsonr(frame["truth_gt"], frame["gt"])[0]**2


# Collapse the three truth columns into a single 0/1/2 genotype per row.
data["truth_gt"] = data.apply(
    lambda x: GetGenotype(x["truth_00"], x["truth_01"], x["truth_11"]),
    axis=1,
)
# Overall line: label, r2 across all merged sites, number of sites.
print(" ".join(["YRI", str(_r2(data)), str(data.shape[0])])+"\n")

# Get r2 for different MAF thresholds
# NOTE(review): the filter uses the legend's "ALL" column as-is. If that
# column is an alt-allele frequency rather than a minor-allele frequency,
# sites with ALL > 0.5 are never included at any threshold -- confirm intent.
thresh = [0, 0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
for t in thresh:
    # Cumulative bins: every site whose frequency is at or below the cutoff.
    x = data[data["ALL"] <= t]
    print(" ".join([str(t), str(_r2(x))])+"\n")