-
Notifications
You must be signed in to change notification settings - Fork 2
/
detect_differential_isoforms.py
150 lines (124 loc) · 5.44 KB
/
detect_differential_isoforms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import argparse
import os
import os.path
import rmats_long_utils
def parse_args():
parser = argparse.ArgumentParser(
description=('Detect differential isoform expression using DRIMSeq'))
parser.add_argument(
'--abundance',
required=True,
help='The path to the abundance.esp file from ESPRESSO')
parser.add_argument('--out-dir',
required=True,
help='The path to use as the output directory')
parser.add_argument(
'--group-1',
required=True,
help=('The path to a file listing the sample names for group 1. The'
' file should have a single line with the sample names as a'
' comma separated list. The sample names should match with the'
' ESPRESSO abundance column names.'))
parser.add_argument(
'--group-2',
required=True,
help='The path to a file listing the sample names for group 2.')
parser.add_argument(
'--num-threads',
type=int,
default=1,
help='The number of threads to use (default: %(default)s)')
parser.add_argument(
'--adj-pvalue',
type=float,
default=0.05,
help='The cutoff for adjusted p-value (default: %(default)s)')
parser.add_argument('--use-unadjusted-pvalue',
action='store_true',
help='Use pvalue instead of adj_pvalue for the cutoff')
parser.add_argument(
'--delta-proportion',
type=float,
default=0.1,
help='The cutoff for delta isoform proportion (default: %(default)s)')
return parser.parse_args()
def sort_tsv_by_columns(tsv_path, column_names):
rows_with_sort_key = list()
with open(tsv_path, 'rt') as handle:
for line_i, line in enumerate(handle):
columns = line.rstrip('\n').split('\t')
if line_i == 0:
header_line = line
header = columns
column_indices = list()
for name in column_names:
column_indices.append(header.index(name))
continue
sort_key = list()
for column_i in column_indices:
sort_key.append(columns[column_i])
rows_with_sort_key.append((line, sort_key))
rows_with_sort_key.sort(key=lambda p: p[1])
with open(tsv_path, 'wt') as handle:
handle.write(header_line)
for row, key in rows_with_sort_key:
handle.write(row)
def detect_isoforms(script_dir, abundance_path, out_dir, num_threads,
group_1_path, group_2_path):
r_script_path = os.path.join(script_dir, 'detect_differential_isoforms.R')
command = [
'Rscript', r_script_path, abundance_path, out_dir,
str(num_threads), group_1_path, group_2_path
]
rmats_long_utils.run_command(command)
def sort_output(out_dir):
gene_tsv = os.path.join(out_dir, 'differential_genes.tsv')
transcript_tsv = os.path.join(out_dir, 'differential_transcripts.tsv')
sort_tsv_by_columns(gene_tsv, ['gene_id'])
sort_tsv_by_columns(transcript_tsv, ['gene_id', 'feature_id'])
def calculate_isoform_proportion(script_dir, out_dir, abundance_path,
group_1_path, group_2_path,
python_executable):
proportion_script_path = os.path.join(script_dir,
'calculate_isoform_proportion.py')
diff_transcripts = os.path.join(out_dir, 'differential_transcripts.tsv')
tmp_file = '{}.tmp'.format(diff_transcripts)
command = [
python_executable, proportion_script_path, '--diff-transcripts',
diff_transcripts, '--abundance', abundance_path, '--group-1',
group_1_path, '--group-2', group_2_path, '--tmp-file', tmp_file
]
rmats_long_utils.run_command(command)
def count_significant_isoforms(script_dir, out_dir, python_executable,
adj_pvalue, use_unadjusted_pvalue,
delta_proportion):
count_script_path = os.path.join(script_dir,
'count_significant_isoforms.py')
diff_transcripts = os.path.join(out_dir, 'differential_transcripts.tsv')
filtered_transcripts = os.path.join(
out_dir, 'differential_transcripts_filtered.tsv')
command = [
python_executable, count_script_path, '--diff-transcripts',
diff_transcripts, '--out-tsv', filtered_transcripts, '--adj-pvalue',
str(adj_pvalue), '--delta-proportion',
str(delta_proportion)
]
if use_unadjusted_pvalue:
command.append('--use-unadjusted-pvalue')
rmats_long_utils.run_command(command)
def detect_differential_isoforms(args):
script_dir = rmats_long_utils.get_script_dir()
python_executable = rmats_long_utils.get_python_executable()
detect_isoforms(script_dir, args.abundance, args.out_dir, args.num_threads,
args.group_1, args.group_2)
sort_output(args.out_dir)
calculate_isoform_proportion(script_dir, args.out_dir, args.abundance,
args.group_1, args.group_2, python_executable)
count_significant_isoforms(script_dir, args.out_dir, python_executable,
args.adj_pvalue, args.use_unadjusted_pvalue,
args.delta_proportion)
def main():
args = parse_args()
detect_differential_isoforms(args)
if __name__ == '__main__':
main()