-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun_ccle_mutation_prediction.py
235 lines (209 loc) · 10.6 KB
/
run_ccle_mutation_prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
"""
Script to run pan-cancer classification experiments for all chosen combinations
of gene and cancer type.
Output files are identified by {gene}_{cancer_type} (in this order).
"""
import sys
import argparse
import itertools as it
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
import pancancer_evaluation.config as cfg
from pancancer_evaluation.data_models.ccle_data_model import CCLEDataModel
from pancancer_evaluation.exceptions import (
NoTrainSamplesError,
NoTestSamplesError,
OneClassError,
ResultsFileExistsError
)
from pancancer_evaluation.utilities.classify_utilities import run_cv_cancer_type
import pancancer_evaluation.utilities.ccle_data_utilities as du
from pancancer_evaluation.utilities.data_utilities import (
load_custom_genes,
get_classification
)
import pancancer_evaluation.utilities.file_utilities as fu
def process_args():
    """Parse and validate the command line arguments for this script.

    In addition to parsing, this loads the CCLE sample info (needed to
    validate the requested holdout cancer types) and normalizes the path
    arguments so the rest of the script can rely on them being
    ``pathlib.Path`` objects.

    Returns:
        tuple: ``(args, sample_info_df)`` where ``args`` is the populated
        ``argparse.Namespace`` (with ``results_dir`` and ``log_file`` as
        resolved ``Path`` objects and ``holdout_cancer_types`` always set),
        and ``sample_info_df`` is the value returned by
        ``du.load_sample_info``.

    Exits via ``ArgumentParser.error`` if any requested holdout cancer type
    is not among the cancer types reported by ``du.get_cancer_types``.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--genes', nargs='*', default=None,
                   help='currently this needs to be a subset of top_50')
    p.add_argument('--feature_selection',
                   choices=['mad', 'pancan_f_test', 'median_f_test', 'random'],
                   default='mad',
                   help='method to use for feature selection, only applied if '
                        '0 < num_features < total number of columns')
    p.add_argument('--holdout_cancer_types', nargs='*', default=None,
                   help='provide a list of cancer types to hold out, uses all '
                        'cancer types in CCLE if none are provided')
    p.add_argument('--log_file', default=None,
                   help='name of file to log skipped cancer types to')
    p.add_argument('--mad_preselect', type=int, default=None,
                   help='if included, pre-select this many features by MAD, '
                        'before applying primary feature selection method. this '
                        'can help to speed up more complicated feature selection '
                        'approaches')
    p.add_argument('--num_features', type=int, default=cfg.num_features_raw,
                   help='if included, subset gene features to this number of '
                        'features having highest mean absolute deviation')
    p.add_argument('--num_folds', type=int, default=4,
                   help='number of folds of cross-validation to run')
    p.add_argument('--results_dir', default=cfg.results_dir,
                   help='where to write results to')
    p.add_argument('--ridge', action='store_true',
                   help='use ridge regression rather than default elastic net')
    p.add_argument('--seed', type=int, default=cfg.default_seed)
    p.add_argument('--training_samples',
                   choices=['single_cancer', 'pancancer', 'all_other_cancers'],
                   default='single_cancer',
                   help='set of samples to train model on')
    p.add_argument('--verbose', action='store_true')
    args = p.parse_args()

    # the sample info is needed here to validate the holdout cancer types,
    # and is returned so the caller does not have to load it a second time
    sample_info_df = du.load_sample_info(args.verbose)
    ccle_cancer_types = du.get_cancer_types(sample_info_df)
    if args.holdout_cancer_types is None:
        args.holdout_cancer_types = ccle_cancer_types
    else:
        not_in_ccle = set(args.holdout_cancer_types) - set(ccle_cancer_types)
        if not_in_ccle:
            # sort so the error message is deterministic across runs
            p.error('some cancer types not present in CCLE: {}'.format(
                ' '.join(sorted(not_in_ccle))))

    args.results_dir = Path(args.results_dir).resolve()

    # always hand back a Path: the caller invokes Path methods on log_file,
    # so a user-supplied string must be converted as well as the default
    if args.log_file is None:
        args.log_file = Path(args.results_dir, 'log_skipped.tsv').resolve()
    else:
        args.log_file = Path(args.log_file).resolve()

    return args, sample_info_df
if __name__ == '__main__':
    # Driver: for every (shuffle_labels, gene, holdout cancer type)
    # combination, run cross-validation and save the results; combinations
    # that cannot be run are recorded in the log file instead.

    # process command line arguments
    args, sample_info_df = process_args()

    # create results dir if it doesn't exist
    args.results_dir.mkdir(parents=True, exist_ok=True)

    # create empty log file if it doesn't exist
    log_columns = [
        'gene',
        'cancer_type',
        'training_samples',
        'shuffle_labels',
        'skip_reason'
    ]
    if not args.log_file.is_file():
        log_df = pd.DataFrame(columns=log_columns)
        log_df.to_csv(args.log_file, sep='\t')

    ccle_data = CCLEDataModel(sample_info=sample_info_df,
                              feature_selection=args.feature_selection,
                              num_features=args.num_features,
                              mad_preselect=args.mad_preselect,
                              seed=args.seed,
                              verbose=args.verbose)

    genes_df = load_custom_genes(args.genes)

    # only add a cancer type covariate if we're training using pan-cancer data
    # (loop-invariant, so computed once up front)
    is_pancancer = (args.training_samples == 'pancancer')

    # run every experiment twice: once with the true labels and once with
    # shuffle_labels=True
    for shuffle_labels in (False, True):

        print('training_samples: {}, shuffle_labels: {}'.format(
            args.training_samples, shuffle_labels))

        outer_progress = tqdm(genes_df.iterrows(),
                              total=genes_df.shape[0],
                              ncols=100,
                              file=sys.stdout)

        for gene_idx, gene_series in outer_progress:
            gene = gene_series.gene
            classification = gene_series.classification
            outer_progress.set_description('gene: {}'.format(gene))

            try:
                gene_dir = fu.make_gene_dir(args.results_dir,
                                            gene,
                                            dirname=args.training_samples)
                ccle_data.process_data_for_gene(
                    gene,
                    classification,
                    gene_dir,
                    add_cancertype_covariate=is_pancancer
                )
            except KeyError:
                # this might happen if the given gene isn't in the mutation data
                # (or has a different alias)
                print('Gene {} not found in mutation data, skipping'.format(gene),
                      file=sys.stderr)
                # values must line up one-to-one with log_columns; the gene
                # is skipped as a whole, so there is no single cancer type
                # to report and a placeholder is logged instead
                cancer_type_log_df = fu.generate_log_df(
                    log_columns,
                    [gene, 'N/A', args.training_samples, shuffle_labels,
                     'gene_not_found']
                )
                fu.write_log_file(cancer_type_log_df, args.log_file)
                continue

            inner_progress = tqdm(args.holdout_cancer_types,
                                  ncols=100,
                                  file=sys.stdout)

            for cancer_type in inner_progress:
                inner_progress.set_description(
                    'cancer type: {}'.format(cancer_type))
                # stays None unless this combination is skipped
                cancer_type_log_df = None

                try:
                    # NOTE(review): presumably this is the call that raises
                    # ResultsFileExistsError when results are already on
                    # disk -- confirm in file_utilities
                    check_file = fu.check_cancer_type_file(gene_dir,
                                                           gene,
                                                           cancer_type,
                                                           shuffle_labels,
                                                           args.seed,
                                                           args.feature_selection,
                                                           args.num_features)
                    # we're working with pretty small sample sizes for the cell
                    # line data, so we stratify by label across CV folds here
                    # to make sure proportions aren't too imbalanced
                    results = run_cv_cancer_type(ccle_data,
                                                 gene,
                                                 cancer_type,
                                                 sample_info_df,
                                                 args.num_folds,
                                                 args.training_samples,
                                                 shuffle_labels,
                                                 stratify_label=True,
                                                 ridge=args.ridge)
                except ResultsFileExistsError:
                    if args.verbose:
                        print('Skipping because results file exists already: '
                              'gene {}, cancer type {}'.format(gene, cancer_type),
                              file=sys.stderr)
                    cancer_type_log_df = fu.generate_log_df(
                        log_columns,
                        [gene, cancer_type, args.training_samples,
                         shuffle_labels, 'file_exists']
                    )
                except NoTrainSamplesError:
                    if args.verbose:
                        print('Skipping due to no train samples: gene {}, '
                              'cancer type {}'.format(gene, cancer_type),
                              file=sys.stderr)
                    cancer_type_log_df = fu.generate_log_df(
                        log_columns,
                        [gene, cancer_type, args.training_samples,
                         shuffle_labels, 'no_train_samples']
                    )
                except NoTestSamplesError:
                    if args.verbose:
                        print('Skipping due to no test samples: gene {}, '
                              'cancer type {}'.format(gene, cancer_type),
                              file=sys.stderr)
                    cancer_type_log_df = fu.generate_log_df(
                        log_columns,
                        [gene, cancer_type, args.training_samples,
                         shuffle_labels, 'no_test_samples']
                    )
                except OneClassError:
                    if args.verbose:
                        print('Skipping due to one holdout class: gene {}, '
                              'cancer type {}'.format(gene, cancer_type),
                              file=sys.stderr)
                    cancer_type_log_df = fu.generate_log_df(
                        log_columns,
                        [gene, cancer_type, args.training_samples,
                         shuffle_labels, 'one_class']
                    )
                else:
                    # only save results if no exceptions
                    fu.save_results_cancer_type(gene_dir,
                                                check_file,
                                                results,
                                                gene,
                                                cancer_type,
                                                shuffle_labels,
                                                args.seed,
                                                args.feature_selection,
                                                args.num_features)

                if cancer_type_log_df is not None:
                    fu.write_log_file(cancer_type_log_df, args.log_file)