-
Notifications
You must be signed in to change notification settings - Fork 1
/
genomic_regions_mark_overlaps.py
146 lines (128 loc) · 5.15 KB
/
genomic_regions_mark_overlaps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env python
import sys, re, argparse
from argparse import RawTextHelpFormatter
###################################################
### 0. script description and parsing arguments ###
###################################################
synopsis1 = "\
accept coordinates of two genomic regions (chrID, start, and end) and mark\n\
regions in <target> that are included in any region in <query>."
synopsis2 = "detailed description:\n\
1. Input:\n\
- <query>: tab-delimited with an unique ID (uID), chrID, start, and end for\n\
each genomic region; genomic regions can overlap, but uID must be unique;\n\
header not required\n\
- <target>: tab-delimited with an chrID, start, and end for each region in\n\
columns <N>, <N+1>, and <N+2>, respectively; e.g. if the file starts with\n\
chrID, <N> == 1\n\
- recomended uID format for <query> is 'chrID:start-end__annotation'; uID is\n\
the only information appear in the output\n\
- in general, <query> should have the larger genomic regions than <target>;\n\
if the other way around, use '-r' option below\n\
2. Output and options:\n\
- <output> contains the entire line of <target> and uID, tab-delimited; if\n\
there is no region in <query> that includes the genomic region, '_na_'\n\
will be printed instead of uID\n\
- '-r'|'--reverse': look for regions in <query> INCLUDED within a region in\n\
<target>; if the target region includes multiple <query> regions, print\n\
the line in <target> multiple times with each uID; [False]\n\
- '-c'|'--counter': report progress per every X lines; [10000]\n\
by [email protected] 20211115 ver 1.2.1\n\n"
#version_history
#21211115 ver 1.2.1 modified to work with python 3
#20210314 script renamed to "genomic_regions_mark_overlaps.py"
#20201114 script renamed to "genomic_regions_mark_regions_included_in_others.py"
#20200127 ver 1.2 # '-r' option added
#20200125 ver 1.1 # minor modification to the output format
#20150419 ver 1.0
parser = argparse.ArgumentParser(description = synopsis1, epilog = synopsis2, formatter_class = RawTextHelpFormatter)
# positional parameters
parser.add_argument('query', type=str, help="See below")
parser.add_argument('N', type=int)
parser.add_argument('target', type=str, help="See below")
parser.add_argument('output', type=str, help="See below")
# options
parser.add_argument('-r', '--reverse', action="store_true", default=False)
parser.add_argument('-c', '--counter', type=int, default=10000)
args = parser.parse_args()
fin_query = open(args.query, "r")
chrID_colIndex = args.N
fin_target = open(args.target, "r")
fout = open(args.output, "w")
##########################################################
### 1. read <query> and <target> and write to <output> ###
##########################################################
### reading in <query>
lines_in_region_table = 0
faultyLines_in_region_table = 0
uID = ""
chrID = ""
start_q = 0
end_q = 0
regionNum = 0
uID_dict = dict()
chrID_dict = dict()
start_dict = dict()
end_dict = dict()
for line in fin_query:
tok = re.split('\t', line)
# tok = line.split('\t')
lines_in_region_table = lines_in_region_table + 1
try:
uID = tok[0].strip()
chrID = tok[1].strip()
start_q = int(tok[2].strip())
end_q = int(tok[3].strip())
if end_q >= start_q :
uID_dict[regionNum] = uID
chrID_dict[regionNum] = chrID
start_dict[regionNum] = start_q
end_dict[regionNum] = end_q
regionNum = regionNum + 1
else:
faultyLines_in_region_table = faultyLines_in_region_table +1
except (IndexError, ValueError) :
faultyLines_in_region_table = faultyLines_in_region_table +1
print( "Reading <query> file: %s \n" % fin_query.name )
print( "Out of total %d line in query, %d were rejected.\n" % (lines_in_region_table, faultyLines_in_region_table) )
print( "Now marking regions in <target> file: %s \n" % fin_target.name )
fin_query.close()
### reding lines in <target> and print to <output>
Lines = 0
collectedLines = 0
chrID = ""
start_t = 0
end_t = 0
counted = 0
for line in fin_target:
try :
tok = re.split('\t', line)
chrID = tok[chrID_colIndex - 1].strip()
start_t = int( tok[chrID_colIndex].strip() )
end_t = int( tok[chrID_colIndex + 1].strip() )
if args.reverse:
for i in range(0, regionNum) :
if (chrID_dict[i] == chrID) and (start_dict[i] >= start_t) and (end_dict[i] <= end_t) :
fout.write(line.strip() + '\t' + uID_dict[i] + '\n')
counted = 1
else:
for i in range(0, regionNum) :
if (chrID_dict[i] == chrID) and (start_dict[i] <= start_t) and (end_dict[i] >= end_t) :
fout.write(line.strip() + '\t' + uID_dict[i] + '\n')
counted = 1
if (counted == 1):
collectedLines = collectedLines + 1
counted = 0
else:
fout.write(line.strip() + '\t_na_\n') # if not included in (or including) any region in <query>
Lines = Lines + 1
if ( Lines % args.counter == 0):
sys.stdout.write("\r marked %d regions out of %d" % (collectedLines, Lines))
sys.stdout.flush()
except (ValueError, IndexError):
pass
print( "\n\nFor total ", Lines, " regions in ", sys.argv[3].strip(), ", ", collectedLines, " were marked." )
print( "Printing to ", sys.argv[4].strip(), ": \n" )
fin_target.close()
fout.close()
print( "done" )