
Commit b598c16
Author: jhammelm
Message: init commit
0 parents; 12 files changed, +660 / -0 lines

Diff for: CNN.py (+78 lines)
import keras
import numpy as np
from keras.models import Model, load_model, Sequential
from keras.layers import Conv1D, Dense, Reshape, Dropout, LSTM, GlobalMaxPooling1D, MaxPooling1D, Flatten, Input, Concatenate
import tensorflow as tf
from keras import optimizers


class CNN():
    def __init__(self, model_layers, seq_shape, out_shape=2, conv_filter_number=100, conv_filter_size=20):
        model = Sequential()
        for i, layer in enumerate(model_layers):
            if layer == 'conv':
                if i == 0:
                    # First conv layer: seed the filters with known motifs
                    # (plus their reverse complements) and freeze them.
                    JASPAR_motifs = list(np.load('homer_matrix.npy'))
                    filter_len = max([JASPAR_motifs[k].shape[0] for k in range(len(JASPAR_motifs))])
                    conv_layer = Conv1D(input_shape=seq_shape,
                                        filters=len(JASPAR_motifs)*2,
                                        kernel_size=35,
                                        padding="valid",
                                        activation="relu",
                                        strides=1)
                    model.add(conv_layer)
                    conv_weights = conv_layer.get_weights()

                    reverse_motifs = [JASPAR_motifs[j][::-1, ::-1] for j in range(len(JASPAR_motifs))]
                    JASPAR_motifs = JASPAR_motifs + reverse_motifs

                    for j in range(len(JASPAR_motifs)):
                        m = JASPAR_motifs[j][::-1, :]
                        w = m.shape[0]
                        conv_weights[0][:, :, j] = 0
                        # Place each motif at a random offset inside the
                        # 35-wide kernel, keeping a 3-position margin.
                        start = np.random.randint(low=3, high=35-w-3+1)
                        conv_weights[0][start:(start+w), :, j] = m - 0.25
                        conv_weights[1][j] = np.random.uniform(low=-1.0, high=0.0)

                    conv_layer.set_weights(conv_weights)
                    conv_layer.trainable = False
                else:
                    model.add(Conv1D(conv_filter_number, conv_filter_size, activation='relu', padding='same'))
            if layer == 'globalpool':
                model.add(GlobalMaxPooling1D())
            if layer == 'maxpool':
                model.add(MaxPooling1D(3, 1))
            if layer == 'LSTM':
                model.add(LSTM(16, return_sequences=True))
            if layer == 'dense':
                model.add(Dropout(0.1))
                model.add(Dense(128, activation='relu'))

        if 'dense' not in model_layers and 'globalpool' not in model_layers:
            model.add(Flatten())
        model.add(Dropout(0.1))
        model.add(Dense(out_shape, activation='sigmoid'))
        adam = optimizers.Adam(lr=1e-4, clipnorm=0.5, decay=(1e-4/100.0))
        model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
        self.model = model

    def train(self, X, y, sample_weights):
        callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss',
                                                   patience=3),
                     keras.callbacks.History()]
        history = self.model.fit(x=X,
                                 y=y, epochs=100,
                                 shuffle=True,
                                 validation_split=0.2,
                                 batch_size=100, verbose=1,
                                 callbacks=callbacks,
                                 sample_weight=sample_weights)
        return history

    def save(self, h5file):
        self.model.save(h5file)

    def predict(self, X):
        return self.model.predict(X)

    def error(self, X, y):
        return np.linalg.norm(y - self.model.predict(X))
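The constructor builds the network from a list of layer tokens ('conv', 'maxpool', 'globalpool', 'LSTM', 'dense'), seeding the first conv layer from homer_matrix.npy. A minimal usage sketch, assuming homer_matrix.npy is in the working directory; train.fa and activity.txt are hypothetical input files:

# Minimal usage sketch; train.fa and activity.txt are hypothetical inputs.
import numpy as np
from CNN import CNN
from ensemble_utils import fa_to_onehot, act_to_class

X = fa_to_onehot('train.fa')       # (n_seqs, seq_len, 4) one-hot matrix
y = act_to_class('activity.txt')   # (n_seqs, 2) binary labels
net = CNN(model_layers=['conv', 'conv', 'globalpool'],
          seq_shape=(X.shape[1], X.shape[2]))
history = net.train(X, y, sample_weights=np.ones(len(X)))
net.save('model.h5')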

Diff for: CNN_param.py (+84 lines)
import keras
import numpy as np
from keras.models import Model, load_model, Sequential
from keras.layers import Conv1D, Dense, Reshape, Dropout, LSTM, GlobalMaxPooling1D, MaxPooling1D, Flatten, Input, Concatenate
import tensorflow as tf
from keras import optimizers


class CNN():
    def __init__(self, model_layers, seq_shape,
                 conv_filter_number=100,
                 conv_filter_size=20,
                 learning_rate=1e-4,
                 dense_size=128,
                 dropout=0.1,
                 batch_size=32):
        model = Sequential()
        for i, layer in enumerate(model_layers):
            if layer == 'conv':
                if i == 0:
                    # First conv layer seeded with known motifs and frozen,
                    # exactly as in CNN.py.
                    JASPAR_motifs = list(np.load('homer_matrix.npy'))
                    filter_len = max([JASPAR_motifs[k].shape[0] for k in range(len(JASPAR_motifs))])
                    conv_layer = Conv1D(input_shape=seq_shape,
                                        filters=len(JASPAR_motifs)*2,
                                        kernel_size=35,
                                        padding="valid",
                                        activation="relu",
                                        strides=1)
                    model.add(conv_layer)
                    conv_weights = conv_layer.get_weights()

                    reverse_motifs = [JASPAR_motifs[j][::-1, ::-1] for j in range(len(JASPAR_motifs))]
                    JASPAR_motifs = JASPAR_motifs + reverse_motifs

                    for j in range(len(JASPAR_motifs)):
                        m = JASPAR_motifs[j][::-1, :]
                        w = m.shape[0]
                        conv_weights[0][:, :, j] = 0
                        start = np.random.randint(low=3, high=35-w-3+1)
                        conv_weights[0][start:(start+w), :, j] = m - 0.25
                        conv_weights[1][j] = np.random.uniform(low=-1.0, high=0.0)

                    conv_layer.set_weights(conv_weights)
                    conv_layer.trainable = False
                else:
                    model.add(Conv1D(conv_filter_number, conv_filter_size, activation='relu', padding='same'))
            if layer == 'globalpool':
                model.add(GlobalMaxPooling1D())
            if layer == 'maxpool':
                model.add(MaxPooling1D(3, 1))
            if layer == 'LSTM':
                model.add(LSTM(16, return_sequences=True))
            if layer == 'dense':
                model.add(Dropout(dropout))
                model.add(Dense(dense_size, activation='relu'))

        model.add(Dropout(dropout))
        model.add(Dense(2, activation='sigmoid'))
        adam = optimizers.Adam(lr=learning_rate)
        model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
        self.model = model
        self.batch_size = batch_size

    def train(self, X, y, sample_weights):
        callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss',
                                                   patience=3),
                     keras.callbacks.History()]
        history = self.model.fit(x=X,
                                 y=y, epochs=100,
                                 shuffle=True,
                                 validation_split=0.2,
                                 batch_size=self.batch_size,
                                 verbose=1,
                                 callbacks=callbacks,
                                 sample_weight=sample_weights)
        return history

    def save(self, h5file):
        self.model.save(h5file)

    def predict(self, X):
        return self.model.predict(X)

    def error(self, X, y):
        return np.linalg.norm(y - self.model.predict(X))
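This variant exposes the training hyperparameters as constructor arguments (and drops the Flatten fallback and out_shape of CNN.py), which makes it convenient for a small search. A sketch of such a sweep, under the same hypothetical input files as above:

# Hypothetical hyperparameter sweep over CNN_param.CNN; file names as above.
import itertools
import numpy as np
from CNN_param import CNN
from ensemble_utils import fa_to_onehot, act_to_class

X = fa_to_onehot('train.fa')
y = act_to_class('activity.txt')
best_params, best_loss = None, np.inf
for lr, dense in itertools.product([1e-3, 1e-4], [64, 128]):
    net = CNN(['conv', 'globalpool', 'dense'], seq_shape=(X.shape[1], 4),
              learning_rate=lr, dense_size=dense)
    hist = net.train(X, y, sample_weights=np.ones(len(X)))
    loss = min(hist.history['val_loss'])
    if loss < best_loss:
        best_params, best_loss = (lr, dense), loss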

Diff for: README.md (+1 line)

# ensemble-cnn

Diff for: __pycache__/CNN.cpython-37.pyc (3.28 KB, binary file not shown)

Diff for: __pycache__/ensemble_utils.cpython-37.pyc (1.47 KB, binary file not shown)

Diff for: ensemble_utils.py (+35 lines)
import os
import numpy as np


def ensure_dir(file_path):
    # Create the directory if it does not already exist.
    if not os.path.exists(file_path):
        os.makedirs(file_path)


def act_to_class(act):
    # Parse a whitespace-delimited activity table (first row is a header,
    # first column an identifier) into a binary label matrix.
    y = []
    header = True
    for line in open(act):
        if header:
            header = False
            continue
        data = line.strip().split()
        y.append([int(d) for d in data[1:]])
    return np.array(y)


def fa_to_onehot(fa):
    # One-hot encode a FASTA file into an (n_seqs, max_len, 4) array.
    # Assumes one sequence line per record; ambiguous bases get 0.25.
    alpha = ['A', 'C', 'G', 'T']
    sequences = open(fa).read().split(">")[1:]
    seqdict = [seq.strip().split("\n")[1] for seq in sequences]
    seq_mat = []
    slen = max([len(seq) for seq in seqdict])
    for i, seqc in enumerate(seqdict):
        seq = np.zeros((slen, 4))
        for j, c in enumerate(seqc.upper()):
            if c not in alpha:
                seq[j, :] = 0.25
            else:
                aind = alpha.index(c)
                seq[j, aind] = 1
        seq_mat.append(seq)
    return np.array(seq_mat)
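A quick sanity check of the encoder (a minimal sketch; the inline FASTA content is made up):

# Minimal sketch: write a tiny FASTA file and inspect the one-hot encoding.
from ensemble_utils import fa_to_onehot

with open('toy.fa', 'w') as f:
    f.write(">seq1\nACGTN\n>seq2\nTTACG\n")

X = fa_to_onehot('toy.fa')
print(X.shape)  # (2, 5, 4)
print(X[0])     # columns are A, C, G, T; the 'N' row is all 0.25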

Diff for: extract_cluster_sequences.py (+89 lines)
#!/bin/env python
import numpy as np
import pickle
import argparse
from scipy.stats import norm
from sklearn.cluster import AffinityPropagation
from sklearn.metrics import silhouette_score
from sklearn import metrics

parser = argparse.ArgumentParser()
parser.add_argument('fasta')
parser.add_argument('importance')
parser.add_argument('-n', '--ntop', default=25, type=int)
parser.add_argument('-k', '--kmer', default=10, type=int)
parser.add_argument('-p', '--pval', default=0.01, type=float, help='FDR-corrected p-value threshold')
opts = parser.parse_args()

seqs = [l.strip().split()[1] for l in open(opts.fasta).read().split(">")[1:]]
with open(opts.importance, 'rb') as f:
    mat = pickle.load(f)

# Mean importance of every k-mer window, pooled over all sequences; these
# pooled values define the null distribution for the per-window test below.
windows = np.array([np.mean(mat[i, j:(j+opts.kmer), :]) for i in range(len(seqs)) for j in range(len(seqs[0])-opts.kmer)])

mu_window = np.mean(windows)
sigma_window = np.std(windows)
significant_seqs = {}
significant_pvals = {}
nhypothesis = windows.shape[0]
for i in range(len(seqs)):
    for j in range(len(seqs[0])-opts.kmer):
        score = np.mean(mat[i, j:(j+opts.kmer)])
        # One-sided normal tail probability, Bonferroni-corrected below.
        pval = norm.sf(score, loc=mu_window, scale=sigma_window)
        if pval < opts.pval/nhypothesis:
            kmer = seqs[i][j:j+opts.kmer].upper()
            try:
                # Keep the smallest p-value seen for this k-mer.
                significant_pvals[kmer] = min(pval, significant_pvals[kmer])
            except KeyError:
                significant_seqs[kmer] = np.zeros((opts.kmer, 4))
                significant_pvals[kmer] = pval
            significant_seqs[kmer] += mat[i, j:j+opts.kmer, :]

print("# num significant:", len(significant_pvals))

from Bio import pairwise2
subseqs = list(significant_pvals.keys())
affinity = np.zeros((len(subseqs), len(subseqs)))
for i in range(len(subseqs)-1):
    for j in range(i, len(subseqs)):
        # Local alignment score as a similarity between k-mers
        # (match=2, mismatch=-1, gap open=-3, gap extend=-1).
        aln = pairwise2.align.localms(subseqs[i], subseqs[j], 2, -1, -3, -1)
        if len(aln) > 0:
            score = aln[0][2]
        else:
            score = 0.01
        affinity[i, j] = max(score, 0.01)
        affinity[j, i] = max(score, 0.01)

# Search over the preference offset; lower preferences yield fewer clusters.
bestdf = 0.8
bestscore = None
sig_subtract = None
for ss in np.linspace(10, 100, 10):
    significance = [-np.log10(1.0/1000000 + significant_pvals[k]) - ss for k in subseqs]
    labels = AffinityPropagation(damping=bestdf, affinity='precomputed', preference=significance).fit_predict(affinity)
    score = silhouette_score(affinity, labels, metric='precomputed')
    if bestscore is None or score > bestscore:
        bestscore = score
        sig_subtract = ss

significance = [-np.log10(1.0/1000000 + significant_pvals[k]) - sig_subtract for k in subseqs]
aclust = AffinityPropagation(damping=bestdf, affinity='precomputed', preference=significance).fit(affinity)
representatives = [subseqs[i] for i in aclust.cluster_centers_indices_]
print("# affinity clustering damping:", bestdf)
print("# affinity preference:", sig_subtract)
print("# num clusters:", len(representatives))
print("# silhouette coefficient:", bestscore)
sorted_significant = sorted([(k, significant_pvals[k]) for k in representatives], key=lambda kv: kv[1])
for key, pval in sorted_significant:
    score_mat = significant_seqs[key]
    print(">", key + "\t" + str(round(pval, 5)) + "\t" + str(round(np.mean(score_mat), 5)))
    for j in range(opts.kmer):
        for n in range(4):
            print(max(np.round(score_mat[j, n]*100 + 1), 0), end=' ')
        print()
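The significance step above is a one-sided z-test of each window's mean importance against the pooled distribution of all window means, Bonferroni-corrected over the number of windows. A standalone toy illustration of just that step (random data, no biological meaning):

# Toy illustration of the window-significance test used above.
import numpy as np
from scipy.stats import norm

np.random.seed(0)
mat = np.random.normal(size=(5, 50, 4))   # (sequences, positions, bases)
k = 10
windows = np.array([np.mean(mat[i, j:j+k, :])
                    for i in range(mat.shape[0])
                    for j in range(mat.shape[1]-k)])
mu, sigma = np.mean(windows), np.std(windows)

# Bonferroni-corrected one-sided tail test for one particular window.
pval = norm.sf(np.mean(mat[0, 0:k, :]), loc=mu, scale=sigma)
print(pval, pval < 0.01/len(windows))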

Diff for: extract_importance_ensemble.py (+69 lines)
#!/bin/env python
import os
import argparse
import pickle
import numpy as np
import keras
import tensorflow as tf
from tensorflow.python.client import device_lib
from keras.backend.tensorflow_backend import set_session
from keras import activations
from ensemble_utils import *
from CNN import *
from importance_utils import *

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.4
config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
config.allow_soft_placement = True
sess = tf.Session(config=config)
set_session(sess)

print(device_lib.list_local_devices())
parser = argparse.ArgumentParser()
parser.add_argument('testfasta')
parser.add_argument('model', help="model folder")
parser.add_argument('ioutfile')
parser.add_argument('outfile')
opts = parser.parse_args()

X = fa_to_onehot(opts.testfasta)
model_folders = [opts.model+"/"+d for d in os.listdir(opts.model) if os.path.isdir(opts.model+"/"+d)]
with open(opts.model+"/model_acc.pkl", "rb") as f:
    accuracies = pickle.load(f)
total_grads_ed = []
total_grads_es = []
for mi, model in enumerate(model_folders):
    print(model)
    # grads are an X-sized matrix with an importance score for each
    # sequence, at each position in the sequence
    grads_ed = saliency(0, model+"/model.h5", 0, X, 30)*X
    grads_es = saliency(0, model+"/model.h5", 1, X, 30)*X
    total_grads_ed.append(grads_ed)
    total_grads_es.append(grads_es)
    with open(model+"/"+opts.ioutfile+'_tp2.pkl', 'wb') as handle:
        pickle.dump(grads_ed, handle, protocol=2)
    with open(model+"/"+opts.ioutfile+'_tp1.pkl', 'wb') as handle:
        pickle.dump(grads_es, handle, protocol=2)

# Ensemble saliency: accuracy-weighted average of the per-model maps.
saliency_ed = np.zeros(total_grads_ed[0].shape)
for mi, model in enumerate(model_folders):
    saliency_ed += accuracies[model]*total_grads_ed[mi]
saliency_ed = saliency_ed/sum(accuracies.values())
with open(opts.outfile+'_tp1.pkl', 'wb') as handle:
    pickle.dump(saliency_ed, handle, protocol=2)

saliency_es = np.zeros(total_grads_es[0].shape)
for mi, model in enumerate(model_folders):
    saliency_es += accuracies[model]*total_grads_es[mi]
saliency_es = saliency_es/sum(accuracies.values())
with open(opts.outfile+'_tp2.pkl', 'wb') as handle:
    pickle.dump(saliency_es, handle, protocol=2)
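importance_utils.py is not part of this commit, so the saliency function used above is undefined here. A plausible minimal sketch, assuming it returns input gradients of one output class for a saved model; the signature, the unused first argument, and the interpretation of the final argument (30) as a smoothing sample count are guesses from the call sites above:

# Hypothetical stand-in for importance_utils.saliency; the real
# implementation is not included in this commit. Assumes Keras on the
# TensorFlow 1.x backend, matching the session setup above.
import numpy as np
import keras.backend as K
from keras.models import load_model

def saliency(unused, model_h5, class_idx, X, nsamples):
    """Smoothed input gradients of class_idx w.r.t. the input batch."""
    model = load_model(model_h5)
    grad_tensor = K.gradients(model.output[:, class_idx], model.input)[0]
    grad_fn = K.function([model.input], [grad_tensor])
    total = np.zeros(X.shape)
    for _ in range(nsamples):
        # SmoothGrad-style averaging over small input perturbations;
        # the noise scale of 0.1 is an assumption.
        noisy = X + np.random.normal(scale=0.1, size=X.shape)
        total += grad_fn([noisy])[0]
    return total / nsamples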
