Skip to content

Commit 7a0fcb3

Browse files
committed
scripts
1 parent 679bc06 commit 7a0fcb3

File tree

1 file changed

+285
-0
lines changed

1 file changed

+285
-0
lines changed

todo/calc_SFS_jSFS.py

+285
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,285 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
Created on Fri Jul 13 13:43:26 2018
5+
6+
@author: scott
7+
"""
8+
9+
from __future__ import division
10+
from __future__ import print_function
11+
import allel
12+
import numpy as np
13+
import pandas as pd
14+
from allel_class import Chr
15+
import autil as autil
16+
from itertools import combinations
17+
import matplotlib.pyplot as plt
18+
import seaborn as sns
19+
import argparse
20+
# Plot styling: the second call is applied after the first, as in the
# original script.
sns.set_style('white')
sns.set_style('ticks')

# Command-line interface.  The parsed namespace is kept in the module-level
# name ``args`` because the script body reads it directly.
parser = argparse.ArgumentParser()
# The VCF path is always needed: the HDF5 file name is derived from it even
# when --h5 is given, so a missing value would silently become "None.h5".
parser.add_argument('-v', "--vcfFile", required=True, help="path to vcf")
parser.add_argument('--h5', action="store_true", help="h5 exists")
parser.add_argument('-m', "--meta", required=True, help="path to meta data")
args = parser.parse_args()
28+
29+
30+
def makeh5fromvcf(vcfin, altnum, hf5):
    """Convert a VCF file to HDF5 with scikit-allel unless told to skip.

    The output file is written next to the input as ``<vcfin>.h5``.

    Parameters
    ----------
    vcfin : str
        Path to the input VCF file.
    altnum : int
        Accepted but not used by this function; the conversion always runs
        with ``alt_number=2``.
    hf5 : bool
        When true, nothing is converted (the caller indicates that the HDF5
        file is already present).

    Returns
    -------
    None
    """
    if not hf5:
        wanted_fields = [
            'samples',
            'calldata/GQ',
            'variants/ALT',
            'variants/REF',
            'variants/QUAL',
            'variants/CHROM',
            'variants/POS',
            'variants/AF',
            'variants/AB',
            'variants/MQM',
            'variants/DP',
            'calldata/DP',
            'calldata/AD',
            'calldata/GT',
        ]
        target = "{}.h5".format(vcfin)
        # GQ is stored as float32 instead of the library's default type.
        allel.vcf_to_hdf5(vcfin, target, fields=wanted_fields,
                          types={'calldata/GQ': 'float32'}, alt_number=2)
    return None
46+
47+
48+
def asfsStatsSeg(gt, pops, chrm, rand=True, plot=False):
    """Aggregate SFS, singletons and doubletons, on segregating sites only.

    For each population the genotypes are restricted to that population's
    samples, sites that are not segregating within the population are
    dropped, and the site frequency spectrum of the allele-1 counts is
    computed with ``allel.sfs``.

    Parameters
    ----------
    gt : genotype array
        Variants on axis 0, samples on axis 1.  Presumably an
        ``allel.GenotypeArray`` -- TODO confirm against the caller.
    pops : iterable
        One collection of sample (column) indices per population.
    chrm : object
        Chromosome label.  Not used by this function.
    rand : bool
        If true, at most 100000 segregating sites are drawn at random
        without replacement; otherwise every segregating site is used.
    plot : bool
        If true, each spectrum is drawn on a new matplotlib figure.

    Returns
    -------
    tuple of (list, list)
        Per population, the proportion of sites in SFS class 1 and in SFS
        class 2.
    """
    print("asfs")
    max_snps = 100000  # number of SNPs to choose randomly
    aSFS1 = []
    aSFS2 = []
    for p in pops:
        gtpop = gt.take(p, axis=1)
        seg = gtpop.count_alleles().is_segregating()
        gtseg = gtpop.compress(seg)
        if rand:
            n_seg = gtseg.shape[0]
            # Cap the draw at the number of available sites instead of
            # relying on a ValueError from an oversized request.
            vidx = np.random.choice(n_seg, min(max_snps, n_seg),
                                    replace=False)
            vidx.sort()
            gtp = gtseg.take(vidx, axis=0)
        else:
            # Every site is wanted, so no index array is needed; a sorted
            # full permutation would select the same rows in the same order.
            gtp = gtseg
        sfsp = allel.sfs(gtp.count_alleles()[:, 1])
        print(sfsp)
        if plot:
            fig, ax = plt.subplots(figsize=(6, 6))
            allel.stats.plot_sfs(sfsp, ax=ax)
        tots = np.sum(sfsp)
        aSFS1.append(sfsp[1] / tots)
        aSFS2.append(sfsp[2] / tots)
    return (aSFS1, aSFS2)
79+
80+
81+
def jsfsStatsSeg(gt, pops, chrm, fold=False, rand=True, plot=False):
    """Joint site frequency spectrum with scikit-allel, segregating sites only.

    For every pair of populations, sites that are not segregating in the
    combined pair are dropped, the joint SFS is computed, zero-padded to a
    fixed shape and reduced to 23 region sums.

    Parameters
    ----------
    gt : genotype array
        Variants on axis 0, samples on axis 1.  Presumably an
        ``allel.GenotypeArray`` -- TODO confirm against the caller.
    pops : iterable
        One sequence of sample (column) indices per population.
    chrm : object
        Chromosome label.  Not used by this function.
    fold : bool
        If true use ``allel.joint_sfs_folded``, otherwise ``allel.joint_sfs``
        on the allele-1 counts.
    rand : bool
        If true, at most 100000 segregating sites are drawn at random
        without replacement; otherwise every segregating site is used.
    plot : bool
        If true, each padded joint spectrum is drawn on a new figure.

    Returns
    -------
    list of numpy.ndarray
        One float array of length 23 per population pair, in the order
        produced by ``itertools.combinations(pops, 2)``.
    """
    print("jsfs")
    max_snps = 100000  # number of SNPs to choose randomly

    # Regions of one axis of the spectrum: classes 1-2, the interior, and the
    # two classes just below the last one.  Index 0 and index -1 are used
    # directly.  NOTE: the slices are end-relative, so the regions overlap
    # when an axis has fewer than 7 classes.
    lo, mid, hi = slice(1, 3), slice(3, -3), slice(-3, -1)
    # (row selector, column selector) of each of the 23 summary bins.
    bins = (
        (0, lo), (lo, 0),
        (0, mid), (mid, 0),
        (0, hi), (hi, 0),
        (lo, lo), (lo, mid), (mid, lo),
        (hi, mid), (mid, hi),
        (lo, hi), (hi, lo),
        (mid, mid), (hi, hi),
        (0, -1), (-1, 0),
        (-1, lo), (lo, -1),
        (-1, mid), (mid, -1),
        (-1, hi), (hi, -1),
    )

    jsfslist = []
    for i, j in combinations(pops, 2):
        # Concatenate the two index collections explicitly; a bare ``i + j``
        # would add element-wise if the populations are numpy arrays.
        gtpops = gt.take(list(i) + list(j), axis=1)
        seg = gtpops.count_alleles().is_segregating()
        # Filter the full array (not gtpops) so the column indices in i and j
        # remain valid below.
        gtseg = gt.compress(seg)
        if rand:
            n_seg = gtseg.shape[0]
            vidx = np.random.choice(n_seg, min(max_snps, n_seg),
                                    replace=False)
            vidx.sort()
            gtr = gtseg.take(vidx, axis=0)
        else:
            # All sites wanted: same rows, same order, no permutation needed.
            gtr = gtseg
        ac1 = gtr.take(i, axis=1).count_alleles()
        ac2 = gtr.take(j, axis=1).count_alleles()

        # Chromosomes per population, assuming diploid samples (the original
        # unfolded padding used len * 2 as well).
        n1, n2 = 2 * len(i), 2 * len(j)
        if fold:
            jsfs = allel.joint_sfs_folded(ac1, ac2)
            # A folded spectrum has n // 2 + 1 classes per axis; the shape
            # must be made of ints for np.zeros.
            fs = np.zeros((n1 // 2 + 1, n2 // 2 + 1), dtype=int)
        else:
            jsfs = allel.joint_sfs(ac1[:, 1], ac2[:, 1])
            fs = np.zeros((n1 + 1, n2 + 1), dtype=int)
        # Pad: the spectrum returned by allel can be smaller than the full
        # shape (presumably sized from the largest observed count).
        fs[:jsfs.shape[0], :jsfs.shape[1]] = jsfs

        if plot:
            fig, ax = plt.subplots(figsize=(6, 6))
            allel.stats.plot_joint_sfs(fs, ax=ax)

        jsfslist.append(
            np.array([np.sum(fs[rows, cols]) for rows, cols in bins],
                     dtype=float))
    return jsfslist
147+
148+
149+
def jsfsStats(gt, pops, chrm, fold=False, plot=False):
    """Joint site frequency spectrum with scikit-allel on a random site subset.

    At most 100000 sites are drawn once, without replacement, and shared by
    all population pairs.  For every pair the joint SFS is computed,
    zero-padded to a fixed shape and reduced to 23 region sums.

    Parameters
    ----------
    gt : genotype array
        Variants on axis 0, samples on axis 1.  Presumably an
        ``allel.GenotypeArray`` -- TODO confirm against the caller.
    pops : iterable
        One sequence of sample (column) indices per population.
    chrm : object
        Chromosome label.  Not used by this function.
    fold : bool
        If true use ``allel.joint_sfs_folded``, otherwise ``allel.joint_sfs``
        on the allele-1 counts.
    plot : bool
        If true, each padded joint spectrum is drawn on a new figure.

    Returns
    -------
    list of numpy.ndarray
        One float array of length 23 per population pair, in the order
        produced by ``itertools.combinations(pops, 2)``.
    """
    print("jsfs")
    max_snps = 100000  # number of SNPs to choose randomly
    n_sites = gt.shape[0]
    # Cap the draw at the number of available sites instead of relying on a
    # ValueError from an oversized request.
    vidx = np.random.choice(n_sites, min(max_snps, n_sites), replace=False)
    vidx.sort()
    gtr = gt.take(vidx, axis=0)

    # Regions of one axis of the spectrum: classes 1-2, the interior, and the
    # two classes just below the last one.  Index 0 and index -1 are used
    # directly.  NOTE: the slices are end-relative, so the regions overlap
    # when an axis has fewer than 7 classes.
    lo, mid, hi = slice(1, 3), slice(3, -3), slice(-3, -1)
    # (row selector, column selector) of each of the 23 summary bins.
    bins = (
        (0, lo), (lo, 0),
        (0, mid), (mid, 0),
        (0, hi), (hi, 0),
        (lo, lo), (lo, mid), (mid, lo),
        (hi, mid), (mid, hi),
        (lo, hi), (hi, lo),
        (mid, mid), (hi, hi),
        (0, -1), (-1, 0),
        (-1, lo), (lo, -1),
        (-1, mid), (mid, -1),
        (-1, hi), (hi, -1),
    )

    jsfslist = []
    for i, j in combinations(pops, 2):
        ac1 = gtr.take(i, axis=1).count_alleles()
        ac2 = gtr.take(j, axis=1).count_alleles()

        # Chromosomes per population, assuming diploid samples (the original
        # unfolded padding used len * 2 as well).
        n1, n2 = 2 * len(i), 2 * len(j)
        if fold:
            jsfs = allel.joint_sfs_folded(ac1, ac2)
            # A folded spectrum has n // 2 + 1 classes per axis; the shape
            # must be made of ints for np.zeros.
            fs = np.zeros((n1 // 2 + 1, n2 // 2 + 1), dtype=int)
        else:
            jsfs = allel.joint_sfs(ac1[:, 1], ac2[:, 1])
            fs = np.zeros((n1 + 1, n2 + 1), dtype=int)
        # Pad: the spectrum returned by allel can be smaller than the full
        # shape (presumably sized from the largest observed count).
        fs[:jsfs.shape[0], :jsfs.shape[1]] = jsfs

        if plot:
            fig, ax = plt.subplots(figsize=(6, 6))
            allel.stats.plot_joint_sfs(fs, ax=ax)

        jsfslist.append(
            np.array([np.sum(fs[rows, cols]) for rows, cols in bins],
                     dtype=float))
    return jsfslist
207+
208+
209+
def asfsStats(gt, pops, chrm, rand=True, plot=False):
    """Aggregate SFS, singletons and doubletons.

    For each population the site frequency spectrum of the allele-1 counts
    is computed with ``allel.sfs`` and the share of classes 1 and 2 is
    recorded.  The chromosome label, the spectrum and its total are printed
    for every population.

    Parameters
    ----------
    gt : genotype array
        Variants on axis 0, samples on axis 1.  Presumably an
        ``allel.GenotypeArray`` -- TODO confirm against the caller.
    pops : iterable
        One collection of sample (column) indices per population.
    chrm : object
        Chromosome label; only used in the progress output.
    rand : bool
        If true, at most 100000 sites are drawn at random without
        replacement; otherwise all sites are used.
    plot : bool
        If true, each spectrum is drawn on a new matplotlib figure.

    Returns
    -------
    tuple of (list, list)
        Per population, the proportion of sites in SFS class 1 and in SFS
        class 2.
    """
    print("asfs")
    if rand:
        max_snps = 100000  # number of SNPs to choose randomly
        n_sites = gt.shape[0]
        # Cap the draw at the number of available sites instead of relying
        # on a ValueError from an oversized request.
        vidx = np.random.choice(n_sites, min(max_snps, n_sites),
                                replace=False)
        vidx.sort()
        gtr = gt.take(vidx, axis=0)
    else:
        gtr = gt
    aSFS1 = []
    aSFS2 = []
    for p in pops:
        gtp = gtr.take(p, axis=1)
        sfsp = allel.sfs(gtp.count_alleles()[:, 1])
        # Report the chromosome passed in, not a module-level loop variable.
        print(chrm)
        print(sfsp)
        tots = np.sum(sfsp)
        print(tots)
        if plot:
            fig, ax = plt.subplots(figsize=(6, 6))
            allel.stats.plot_sfs(sfsp, ax=ax)
        aSFS1.append(sfsp[1] / tots)
        aSFS2.append(sfsp[2] / tots)
    return (aSFS1, aSFS2)
238+
239+
240+
if __name__ == "__main__":
    # Build <vcf>.h5 unless --h5 was given; makeh5fromvcf takes the skip flag
    # as its third positional argument.
    makeh5fromvcf(args.vcfFile, 1, args.h5)
    meta = pd.read_csv(args.meta, delimiter=",")
    var = Chr('All', "{}.h5".format(args.vcfFile))
    popdict = autil.subpops(var, meta, bypop=True, bykary=False)
    pop2color = autil.popcols(popdict)
    chrlist = np.unique(var.chrm[:])
    pops = list(popdict.values())

    # Per-chromosome statistics.  Only the aggregate SFS is enabled; the
    # other variants are kept as switchable alternatives.
    sfsdict = {}
    jsfsdict = {}
    for c in chrlist:
        var.geno(c, meta)
        #sfsdict[c] = asfsStatsSeg(var.gt, pops, c, rand=False, plot=False)
        sfsdict[c] = asfsStats(var.gt, pops, c, rand=False, plot=False)
        #jsfsdict[c] = jsfsStatsSeg(var.gt, pops, c, fold=False, rand=False, plot=True)
        #jsfsdict[c] = jsfsStats(var.gt, pops, c)

    # asfs: average the per-population proportions over chromosomes.
    s1array = np.mean(np.vstack([sfsdict[k][0] for k in sfsdict]), axis=0)
    s2array = np.mean(np.vstack([sfsdict[k][1] for k in sfsdict]), axis=0)

    # jsfs: turn each pair's 23 bin counts into proportions per chromosome.
    props = []
    for chrom_jsfs in jsfsdict.values():
        totals = np.sum(chrom_jsfs, axis=1)
        props.append([row / totals[k] for k, row in enumerate(chrom_jsfs)])
    # zip(*props) regroups the per-chromosome lists by population pair and
    # yields nothing when no joint SFS was computed, so an empty jsfsdict no
    # longer fails on props[0].
    jsfs = [np.mean(np.vstack(per_pair), axis=0) for per_pair in zip(*props)]

    # write out
    s1_text = " ".join(map(str, list(s1array)))
    s2_text = " ".join(map(str, list(s2array)))
    # np.concatenate rejects an empty list, so emit an empty field instead.
    j23 = " ".join(map(str, np.concatenate(jsfs).ravel())) if jsfs else ""
    with open("Observed_summStats.out", 'w') as f:
        f.write("{} {} {}\n".format(s1_text, s2_text, j23))

0 commit comments

Comments
 (0)