-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_enformer_feats.py
50 lines (44 loc) · 1.52 KB
/
get_enformer_feats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import tensorflow_hub as hub
import tensorflow as tf
import numpy as np
import os, sys
from pyfasta import Fasta
import math
import h5py
# Module-level configuration. Everything below runs at import time, before
# main() is called.

# Load the Enformer model object from the TensorFlow Hub handle below.
enformer_model = hub.load("https://tfhub.dev/deepmind/enformer/1").model
# Width (in sequence positions) of the window sliced around each centre in main().
SEQ_LENGTH = 393216
# Step between consecutive window centres in main(); presumably 896 output
# bins x 128 bp per bin for Enformer -- TODO confirm.
interval = 896*128
# Positional command-line arguments (all six are required; a missing one
# raises IndexError here).
# chr_id: chromosome identifier, first part of the FASTA record name 'chr<chr_id>_<parent>'.
chr_id = sys.argv[1]
# start / end: integer coordinates of the region; main() centres its windows on their midpoint.
start = int(sys.argv[2])
end = int(sys.argv[3])
# parent: second part of the FASTA record name 'chr<chr_id>_<parent>'.
parent = sys.argv[4]
# fasta_path: path handed to pyfasta.Fasta in main().
fasta_path = sys.argv[5]
# save_path: HDF5 output path, opened in 'w' mode by main().
save_path =sys.argv[6]
# One-hot encoding of a nucleotide sequence
def seq_to_mat(seq):
    """Return the one-hot encoding of a nucleotide sequence.

    Args:
        seq: Sequence of single characters drawn from ``ACGTN`` (upper or
            lower case).

    Returns:
        A float64 array of shape ``(4, len(seq))`` whose rows correspond to
        A, C, G and T. A column holds a single 1 in the row of its base;
        columns for ``N``/``n`` are all zeros.

    Raises:
        KeyError: If ``seq`` contains a character other than ``ACGTN``
            (case-insensitive).
    """
    base_to_row = {
        'a': 0, 'A': 0,
        'c': 1, 'C': 1,
        'g': 2, 'G': 2,
        't': 3, 'T': 3,
        'N': 4, 'n': 4,
    }
    # An explicit integer dtype keeps the index array valid for empty input.
    rows = np.array([base_to_row[base] for base in seq], dtype=np.intp)
    # Row 4 is a scratch row that collects the N positions; it is dropped
    # below so that N ends up as an all-zero column.
    mat = np.zeros((5, len(seq)))
    # One vectorised assignment instead of a Python-level loop per base.
    mat[rows, np.arange(len(seq))] = 1
    return mat[:4, :]
def main():
    """Run Enformer over windows tiling the requested region and save the output.

    Reads the module-level settings parsed from the command line (``chr_id``,
    ``start``, ``end``, ``parent``, ``fasta_path``, ``save_path``).

    ``2 * nb_regions + 1`` windows of ``SEQ_LENGTH`` positions are taken from
    the FASTA record ``'chr<chr_id>_<parent>'``. Their centres are spaced
    ``interval`` apart and placed symmetrically around the midpoint of
    ``[start, end]``. Each window is one-hot encoded and passed to the model
    as a batch of one; the ``'human'`` output is kept.

    The result is written to ``save_path`` as an HDF5 file with one dataset
    per output bin, named ``bin_<i>``, each holding an array indexed by
    (window, feature).
    """
    center = (start + end) // 2
    nb_regions = math.ceil((end - start - interval) / (2 * interval))
    genome = Fasta(fasta_path)
    # The record name does not depend on the window, so look it up once.
    record = genome['chr%s_%s' % (chr_id, parent)]
    half_window = SEQ_LENGTH // 2

    enformer_feats = []
    for coor in range(center - interval * nb_regions,
                      center + interval * (nb_regions + 1),
                      interval):
        # NOTE(review): a window extending past either end of the record is
        # not padded here, so the slice may be shorter than SEQ_LENGTH
        # (or wrap, for a negative start) -- confirm callers avoid this.
        seq = record[(coor - half_window):(coor + half_window)]
        # Transpose to (length, 4) and add a leading batch axis of size 1.
        onehot_mat = np.expand_dims(seq_to_mat(seq).T, 0)
        enformer_feats.append(enformer_model.predict_on_batch(onehot_mat)['human'])

    # Drop only the per-call batch axis. A bare np.squeeze would also drop
    # the window axis when there is a single window, leaving a 2-D array
    # and breaking the three-way shape unpacking below.
    enformer_feats = np.squeeze(np.stack(enformer_feats), axis=1)
    # Values are rounded through float16 before being stored as float32
    # below; kept as in the original so the written values are unchanged.
    enformer_feats = enformer_feats.astype(np.float16)

    _, nb_bins, _ = enformer_feats.shape
    # The context manager closes the file even if a write fails.
    with h5py.File(save_path, 'w') as f_out:
        for i in range(nb_bins):
            f_out.create_dataset('bin_%d' % i, data=enformer_feats[:, i, :], dtype='float32')
# Script entry point: only run the extraction when executed directly.
if __name__ == "__main__":
    main()