-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathcombination_model_train.py
126 lines (92 loc) · 4.63 KB
/
combination_model_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 28 17:47:20 2020
@author: akanksha
"""
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier
from itertools import combinations
import sklearn
from functools import reduce
import os
import argparse
import warnings
warnings.filterwarnings("ignore")
# Command-line interface: one optional TSV path per methylation caller, plus
# the required combination size and output directory. Parsing happens at
# module level so `options` is available to the code below.
parser = argparse.ArgumentParser(description = 'train and save the combined models')

# (long flag, short flag, help text) for each per-tool input file, in the
# order they are registered on the parser.
_TOOL_ARGUMENTS = (
    ('--deepsignalfilepath', '-d',
     'Path to deepsignal output tsv file in the format [ID, Pos,Strand,Label,Score]. Can be compressed in gz.'),
    ('--nanopolishfilepath', '-n',
     'Path to nanopolish output tsv file in the format [ID, Pos, Strand,Label,Score]. Can be compressed in gz.'),
    ('--guppyfilepath', '-g',
     'Path to guppy output tsv file in the format [ID, Pos,Strand,Label,Score]. Can be compressed in gz.'),
    ('--megalodonfilepath', '-m',
     'Path to megalodon output tsv file in the format [ID, Pos, Strand,Label,Score]. Can be compressed in gz.'),
    ('--tombofilepath', '-t',
     'Path to tombo output tsv file in the format [ID, Pos,Strand,Label,Score]. Can be compressed in gz.'),
)
for _long_flag, _short_flag, _help_text in _TOOL_ARGUMENTS:
    parser.add_argument(_long_flag, _short_flag, type=str, required=False,
                        help=_help_text)

# Number of tools whose scores are merged into each trained model (2 to 5).
parser.add_argument('--combinations', '-c', type=int, choices=range(2, 6),
                    required=True,
                    help='number of methods to combine together for training')
# Directory that receives the serialized models.
parser.add_argument('--output', '-o', type=str, required=True,
                    help='Where to save the trained model')

options = parser.parse_args()
def mod_file(data_file_path):
    """Load one tool's per-read prediction table and normalise it for merging.

    The file is read as a tab-separated table with a header row and is
    expected to provide the columns ``ID``, ``Pos``, ``Strand``, ``Label``
    and ``Score``.

    Processing steps:
      * rows duplicated on (``ID``, ``Pos``, ``Strand``) are dropped, keeping
        the first occurrence;
      * ``Pos`` is decreased by 1 on rows whose ``Strand`` is ``"-"``
        (presumably so both strands of a site share one coordinate - confirm);
      * the ``Strand`` column is removed;
      * ``Score`` is renamed to the file name up to its first dot, e.g.
        ``/data/deepsignal.tsv.gz`` gives a column called ``deepsignal``.

    Parameters
    ----------
    data_file_path : str or os.PathLike
        Path to the TSV file (anything ``pandas.read_csv`` can open).

    Returns
    -------
    pandas.DataFrame
        The cleaned table with a fresh 0..n-1 index.
    """
    data_file = pd.read_csv(data_file_path, header=0, sep="\t")

    # Derive the score column name from the file name only. Backslashes are
    # treated as separators too, so Windows-style paths keep working on any OS;
    # directory components (including "./") never leak into the name.
    path_text = os.fspath(data_file_path).replace("\\", "/")
    name = os.path.basename(path_text).split(".")[0]

    data_file = data_file.drop_duplicates(subset=["ID", "Pos", "Strand"])
    data_file = data_file.reset_index(drop=True)

    # One .loc assignment instead of chained indexing, which may write to a
    # temporary copy and leave the frame unchanged.
    data_file.loc[data_file["Strand"] == "-", "Pos"] -= 1

    data_file = data_file.drop(columns=["Strand"])
    data_file = data_file.rename(columns={"Score": name})
    return data_file
def combine_methods(val, sources=None):
    """Merge the score tables of several tools into features and labels.

    The tables named in ``val`` are inner-joined on ``ID``, ``Pos`` and
    ``Label``, so only rows present in every table with the same label remain.
    The labels ``"no"`` and ``"yes"`` are converted to 0 and 1; any other
    label value is passed through unchanged.

    Parameters
    ----------
    val : sequence of str
        Keys of the tables to combine, in the desired feature-column order.
    sources : mapping of str to pandas.DataFrame, optional
        Where to look the tables up. Defaults to the module-level ``methods``
        dictionary, which keeps existing ``combine_methods(val)`` calls working.

    Returns
    -------
    tuple
        ``(features, labels)``: ``features`` is a DataFrame holding the last
        ``len(val)`` columns of the merged table (the score columns when each
        input has the layout ID, Pos, Label, score) and ``labels`` is the
        ``Label`` Series aligned with it.
    """
    if sources is None:
        sources = methods

    frames = [sources[key] for key in val]
    combine_file = reduce(
        lambda left, right: pd.merge(left, right, how='inner',
                                     on=["ID", "Pos", "Label"]),
        frames,
    )
    # Non in-place drop: never mutates a table owned by the caller.
    combine_file = combine_file.drop(columns=["ID", "Pos"])

    # Recode the labels by building a new Series rather than through chained
    # assignment, which may write to a temporary copy and have no effect.
    label_codes = {"no": 0, "yes": 1}
    labels = combine_file["Label"].map(
        lambda value: label_codes.get(value, value))

    features = combine_file[combine_file.columns[-(len(val)):]]
    return (features, labels)
if __name__ == '__main__':
    # Local import: `import sklearn` alone does not guarantee that the
    # `sklearn.preprocessing` submodule has been loaded.
    from sklearn.preprocessing import MinMaxScaler

    # Tool name -> path given on the command line (None when omitted). The
    # order here fixes the order of tool names inside the model file names.
    tool_paths = {
        "deepsignal": options.deepsignalfilepath,
        "nanopolish": options.nanopolishfilepath,
        "guppy": options.guppyfilepath,
        "megalodon": options.megalodonfilepath,
        "tombo": options.tombofilepath,
    }

    # `methods` must stay a module-level name: combine_methods() reads it as a
    # global. Building it in one step means it exists whichever subset of
    # tools was supplied (previously it was only created for deepsignal).
    methods = {tool: mod_file(path)
               for tool, path in tool_paths.items()
               if path is not None}

    if len(methods) < options.combinations:
        # Without this check no combination exists and nothing gets trained.
        parser.error('--combinations {} needs at least {} input files, '
                     'but only {} were given'.format(
                         options.combinations, options.combinations,
                         len(methods)))

    # Re-running into an existing output directory is allowed.
    os.makedirs(options.output, exist_ok=True)

    for val in combinations(methods, options.combinations):
        # Merge and scale once per combination; both models share the data.
        X, y = combine_methods(val)
        # NOTE(review): the fitted scaler is discarded, so it is not stored
        # alongside the saved models.
        X = MinMaxScaler().fit_transform(X)
        y = np.ravel(y.astype(int).to_numpy())
        tools_tag = '_'.join(val)

        # Small forest: 10 trees, each limited to depth 3.
        rf = RandomForestClassifier(max_depth=3, n_estimators=10)
        rf.fit(X, y)
        filename = os.path.join(
            options.output,
            'rf_model_max_depth_3_n_estimator_10_' + tools_tag + '.model')
        # Passing the path lets joblib open and close the file itself.
        joblib.dump(rf, filename)

        # Forest with scikit-learn's default hyper-parameters, compressed.
        rf = RandomForestClassifier()
        rf.fit(X, y)
        filename = os.path.join(
            options.output, 'rf_model_default_' + tools_tag + '.model')
        joblib.dump(rf, filename, compress=3)