-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathiso_name_tagging.py
executable file
·294 lines (249 loc) · 10 KB
/
iso_name_tagging.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
#!/usr/bin/env python
# pylint: disable=line-too-long
"""Add intersecting feature(s) into a SAM file as a tag.
Build new names for the intersecting features from a BED file and add them as a
tag to alignments in a SAM file using the format
FEATURE_ID|5p-shift|3p-shift|CIGAR|MD. If either the BED or the SAM file is
empty, only the SAM file header is returned.
EXPECTED INPUT FILES
The BED file must be the output of a bedtools intersect call with -a being a GFF3
file and -b a BAM file. If the GFF3 used in the bedtools intersect call has the
features start and end coordinates extended, the number of additional
nucleotides can be specified using the CLI option `--extension`. The SAM file
must contain only the reads that have an intersecting feature.
NAME CREATION and TAG ADDITION
For each alignment, the name of the intersecting feature will follow the
format FEATURE_ID|5p-shift|3p-shift|CIGAR|MD. The CLI option `--id` specifies
the feature identifier to be used as FEATURE_ID from within the attributes
column in the BED file. The 5p-shift and the 3p-shift values are the
difference between the feature start and end coordinates and the alignment
start and end coordinates. If `--extension` is provided, the feature start
position is adjusted by adding the given value and the feature end position
by subtracting it. If both the 5p-shift and the 3p-shift are within the range
+/- extension + 1 the feature name is added to the alignment as the new tag
"YW". Multiple intersecting feature names are separated by a semi-colon.
EXAMPLES
Example 1
in BED record:
19 . miRNA 44377 44398 . + . ID=MIMAT0002849;Alias=MIMAT0002849;Name=hsa-miR-524-5p;Derives_from=MI0003160 19 44376 44398 13-1_1 1 + 22
in SAM record:
13-1_1 0 19 44377 1 11M3I11M * 0 0 CTACAAAGGGAGGTAGCACTTTCTC * HI:i:0 MD:Z:22 NH:i:1 NM:i:3 RG:Z:A1 YZ:Z:0
command:
iso_name_tagging.py -b BED -s SAM
new name:
hsa-miR-524-5p|0|0|11M3I11M|22
out SAM record:
13-1_1 0 19 44377 1 11M3I11M * 0 0 CTACAAAGGGAGGTAGCACTTTCTC * HI:i:0 MD:Z:22 NH:i:1 NM:i:3 RG:Z:A1 YZ:Z:0 YW:Z:hsa-miR-524-5p|0|0|11M3I11M|22
Example 2
in BED record:
19 . miRNA 5338 5359 . + . ID=MIMAT0005795;Alias=MIMAT0005795;Name=hsa-miR-1323;Derives_from=MI0003786 19 5337 5358 48-1_1 255 + 21
in SAM record:
48-1_1 0 19 5338 255 21M * 0 0 TCAAAACTGAGGGGCATTTTC * MD:Z:21 NH:i:1 NM:i:0
command:
iso_name_tagging.py -b BED -s SAM
new name:
""
out SAM record:
48-1_1 0 19 5338 255 21M * 0 0 TCAAAACTGAGGGGCATTTTC * MD:Z:21 NH:i:1 NM:i:0 YW:Z:
Example 3
in BED record:
19 . miRNA 5332 5365 . + . ID=MIMAT0005795;Alias=MIMAT0005795;Name=hsa-miR-1323;Derives_from=MI0003786 19 5337 5358 48-1_1 255 + 21
in SAM record:
48-1_1 0 19 5338 255 21M * 0 0 TCAAAACTGAGGGGCATTTTC * MD:Z:21 NH:i:1 NM:i:0
command:
iso_name_tagging.py -b BED -s SAM --extension 6
new name:
hsa-miR-1323|0|-1|21M|21
out SAM record:
48-1_1 0 19 5338 255 21M * 0 0 TCAAAACTGAGGGGCATTTTC * MD:Z:21 NH:i:1 NM:i:0 YW:Z:hsa-miR-1323|0|-1|21M|21
Example 4
in BED record:
19 . miRNA 44377 44398 . + . ID=MIMAT0002849;Alias=MIMAT0002849;Name=hsa-miR-524-5p;Derives_from=MI0003160 19 44376 44398 13-1_1 1 + 22
in SAM record:
13-1_1 0 19 44377 1 11M3I11M * 0 0 CTACAAAGGGAGGTAGCACTTTCTC * HI:i:0 MD:Z:22 NH:i:1 NM:i:3 RG:Z:A1 YZ:Z:0
command:
iso_name_tagging.py -b BED -s SAM --id id
new name:
MIMAT0002849|0|0|11M3I11M|22
out SAM record:
13-1_1 0 19 44377 1 11M3I11M * 0 0 CTACAAAGGGAGGTAGCACTTTCTC * HI:i:0 MD:Z:22 NH:i:1 NM:i:3 RG:Z:A1 YZ:Z:0 YW:Z:MIMAT0002849|0|0|11M3I11M|22
""" # noqa: E501
# pylint: enable=line-too-long
import argparse
from collections import defaultdict, namedtuple
from pathlib import Path
import sys
from typing import Dict, Optional
import pysam
def parse_arguments():
    """Build the command-line interface of the script.

    Returns:
        An ``argparse.ArgumentParser`` exposing the options ``--version``,
        ``--bed``, ``--sam``, ``--extension`` and ``--id``. The module
        docstring is used as the parser description and is printed without
        re-wrapping.
    """
    bed_help = (
        "Path to the BED file. This file must be the output of "
        " a bedtools intersect call with -a being a GFF3 file and"
        " -b a BAM file."
    )
    extension_help = (
        "Number of nucleotides the start and end coordinates of the"
        " annotated features had been extended. Default: %(default)d."
    )
    id_help = (
        "ID used to identify the feature in the name that is added as tag."
        " The ID must be in lowercase. Default: %(default)s."
    )

    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )
    cli.add_argument(
        "-v",
        "--version",
        help="Show program's version number and exit",
        version="%(prog)s 1.0.0",
        action="version",
    )
    # Both input files are mandatory.
    cli.add_argument("-b", "--bed", type=Path, required=True, help=bed_help)
    cli.add_argument(
        "-s",
        "--sam",
        type=Path,
        required=True,
        help="Path to the SAM input file.",
    )
    # Optional tuning parameters.
    cli.add_argument(
        "-e", "--extension", type=int, default=0, help=extension_help
    )
    cli.add_argument("--id", type=str, default="name", help=id_help)
    return cli
def attributes_dictionary(attr: str) -> Dict[str, str]:
    """Create a dictionary from a GFF3/GTF-style attributes column.

    Two layouts are recognised, chosen from the first attribute:

    * ``key=value`` pairs (GFF3 style), e.g. ``ID=X;Name=Y``
    * ``key "value"`` pairs (GTF style), e.g. ``gene_id "X"; gene_name "Y";``

    Keys are stripped of surrounding whitespace and lower-cased. Empty
    entries, such as the one produced by a trailing semicolon, are ignored.
    In the ``key=value`` layout only the first ``=`` separates key and
    value, so values may themselves contain ``=``.

    Args:
        attr:
            Content of the attributes column.

    Returns:
        Dictionary mapping lower-cased attribute keys to their values. An
        empty dictionary if `attr` holds no attributes.

    Raises:
        ValueError:
            If an attribute does not follow the detected layout.
    """
    # A trailing ";" would otherwise yield an empty, unparsable entry.
    pairs = [pair for pair in attr.split(";") if pair.strip()]
    if not pairs:
        return {}

    # A quote before the first "=" means the "=" belongs to a quoted value,
    # i.e. the attributes use the 'key "value"' layout.
    first_key, first_sep, _ = pairs[0].partition("=")
    key_value_style = bool(first_sep) and '"' not in first_key

    attr_dict: Dict[str, str] = {}
    for pair in pairs:
        if key_value_style:
            key, sep, value = pair.partition("=")
        else:
            key, sep, quoted = pair.partition('"')
            # Keep only the text between the first pair of quotes.
            value = quoted.partition('"')[0]
        if not sep:
            raise ValueError(
                f"Malformed attribute {pair.strip()!r} in {attr!r}"
            )
        attr_dict[key.strip().lower()] = value
    return attr_dict
def parse_intersect_output(
    intersect_file: Path, ID: str = "name", extension: int = 0
) -> Optional[Dict[Optional[str], list]]:
    """Parse intersect BED file.

    Given a BED file generated by intersecting a GFF file (-a) with a BAM file
    (-b) using bedtools intersect, create a dictionary where the alignment
    names are the keys. Each value is a list with one
    ``(feature name, start, end)`` tuple per intersecting feature. The `ID`
    argument selects the attribute used as feature name, and `extension` is
    added to the feature start position and subtracted from the feature end
    position. Blank lines are skipped. If the BED file holds no records,
    `None` is returned.

    Args:
        intersect_file:
            Path to the intersect BED file.
        ID:
            Attribute used to identify the feature. Matched
            case-insensitively, as attribute keys are lower-cased when
            parsed. Defaults to "name".
        extension:
            Number of nucleotides the start and end coordinates have to be
            adjusted. Defaults to 0.

    Returns:
        Dictionary mapping read names to the list of their intersecting
        features, or `None` if no record was found.

    Raises:
        KeyError:
            If a record lacks the attribute selected by `ID`.
        TypeError:
            If a record does not have exactly 16 tab-separated columns.
    """
    Fields = namedtuple(
        "Fields",
        (
            "feat_chr",
            "source",
            "feat_type",
            "feat_start",
            "feat_end",
            "feat_score",
            "strand",
            "phase",
            "feat_attributes",
            "read_chr",
            "read_start",
            "read_end",
            "read_name",
            "read_score",
            "read_strand",
            "overlap_len",
        ),
    )
    # Attribute keys are stored in lowercase, so normalise the lookup key.
    attribute_key = ID.lower()
    intersect_data = defaultdict(list)

    with open(intersect_file, "r", encoding="utf-8") as bedfile:
        for line in bedfile:
            record = line.strip()
            if not record:
                # Tolerate blank lines, e.g. a trailing newline at the end.
                continue

            fields = Fields(*record.split("\t"))
            attributes = attributes_dictionary(fields.feat_attributes)
            try:
                mirna_name = attributes[attribute_key]
            except KeyError:
                raise KeyError(
                    f"Attribute {ID!r} not found in the feature intersecting"
                    f" read {fields.read_name!r}"
                ) from None

            # Undo the extension applied to the annotated coordinates.
            mirna_start = int(fields.feat_start) + extension
            mirna_end = int(fields.feat_end) - extension
            intersect_data[fields.read_name].append(
                (mirna_name, mirna_start, mirna_end)
            )

    if not intersect_data:
        return None
    return intersect_data
def get_tags(
    intersecting_mirna: list, alignment: pysam.AlignedSegment, extension: int
) -> set:
    """Build the feature names to be attached to an alignment.

    Every intersecting feature yields a name of the form
    feature-id|5p-shift|3p-shift|CIGAR|MD, where the shifts are the
    differences between the alignment and the feature start/end positions.
    A name is kept only if neither shift exceeds `extension` in absolute
    value.

    Args:
        intersecting_mirna:
            iterable of (feature name, start, end) entries
        alignment:
            alignment the names are built for; it must carry an "MD" tag
        extension:
            largest absolute shift allowed at either end for a feature to
            be reported

    Returns:
        names:
            set with the names of the features that passed the shift filter
    """
    cigar_string = alignment.cigarstring
    md_tag = alignment.get_tag("MD")
    max_shift = extension + 1

    names = set()
    for feature_name, feature_start, feature_end in intersecting_mirna:
        # NOTE(review): the "+ 1" presumably converts pysam's 0-based start
        # to the 1-based feature coordinates; confirm against pysam docs.
        start_shift = alignment.reference_start - feature_start + 1
        end_shift = alignment.reference_end - feature_end
        if abs(start_shift) >= max_shift or abs(end_shift) >= max_shift:
            continue
        names.add(
            f"{feature_name}|{start_shift}|{end_shift}|{cigar_string}|{md_tag}"
        )
    return names
def main(arguments) -> None:
    """Add intersecting feature(s) into a SAM file as a tag.

    The SAM header is always written to standard output. If the BED file
    holds no records, nothing else is written. Otherwise every alignment is
    written with a "YW" tag holding the names of its intersecting features,
    sorted and separated by semi-colons. The tag is empty when no feature
    passes the shift filter.

    Args:
        arguments:
            Parsed command-line arguments providing the attributes `bed`,
            `sam`, `id` and `extension`.
    """
    intersect_data = parse_intersect_output(
        arguments.bed, arguments.id, arguments.extension
    )

    with pysam.AlignmentFile(arguments.sam, "r") as samfile:
        sys.stdout.write(str(samfile.header))

        if intersect_data is None:
            return

        for alignment in samfile:
            # `.get` avoids adding an empty entry to the mapping for every
            # read that has no intersecting feature.
            intersecting_mirnas = intersect_data.get(alignment.query_name, [])
            tags = get_tags(
                intersecting_mirna=intersecting_mirnas,
                alignment=alignment,
                extension=arguments.extension,
            )
            # Sets have no stable iteration order; sort so that the output
            # is identical between runs.
            alignment.set_tag("YW", ";".join(sorted(tags)))
            sys.stdout.write(alignment.to_string() + "\n")
if __name__ == "__main__":
    # Script entry point: parse the command line and tag the alignments.
    main(parse_arguments().parse_args())  # pragma: no cover