forked from OSU-SRLab/MANTIS
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelpers.py
130 lines (118 loc) · 4.36 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# @file helpers.py
# @author Esko Kautto ([email protected])
# @updated 2016-06-16
import time
"""
Since the functionality of .iteritems() has changed between
Python 2 and 3, this function attempts to bridge the gap and
provide similar functionality regardless of Python version.
"""
def iteritems(d):
    """
    Return the key/value pairs of the mapping d.

    Objects exposing an .iteritems() method (e.g. Python 2 dicts)
    have the result of that method returned as-is; anything else
    gets its .items() copied into a list.
    """
    # Guard clause: no legacy method available, so use .items().
    if not hasattr(d, 'iteritems'):
        return list(d.items())
    return d.iteritems()
    # end .iteritems()
"""
Returns the current time in a m/d/y H:M:S format.
"""
def timestamp():
    """
    Return the current time as a 'mm/dd/yy HH:MM:SS' string.
    """
    stamp_format = '%m/%d/%y %H:%M:%S'
    return time.strftime(stamp_format)
    # end .timestamp()
"""
Prints the input string's line(s) with prepended timestamp(s).
"""
def tprint(string):
    """
    Print each line of the input string prefixed with a timestamp.

    The input is split on newline characters so that every line of
    a multi-line message gets its own '[timestamp] ' prefix. Tabs
    inside a line are left untouched. An empty string still
    produces a single (empty) timestamped line.
    """
    # split('\n') rather than splitlines() so that an empty input
    # still yields one element and therefore one printed line.
    for line in string.split('\n'):
        print('[{0}] {1}'.format(timestamp(), line))
    # end .tprint()
"""
Checks to make sure the BED file has the expected 6-column
format. The fourth column is expected to be a formatted
kmer repeat unit and count string. The fifth and sixth
column are not currently utilized, but are required to
maintain the BED 6-col format.
e.g.
chr1 10357206 10357223 (T)17 0 +
"""
def check_bedfile_format(filepath):
    """
    Validate that the file at filepath is a 6-column, tab-separated
    BED file whose 4th column holds a k-mer repeat string such as
    (T)17 or (CAC)5.

    Blank lines are skipped. Validation stops at the first
    offending line: an error message is printed and False is
    returned. Returns True if every non-blank line passes.
    """
    # Plain 'r' is used instead of the legacy 'Ur' mode: the 'U'
    # flag is rejected by newer Python 3 releases, and surrounding
    # whitespace (including any stray '\r') is stripped below anyway.
    with open(filepath, 'r') as f:
        for line_number, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            columns = line.split('\t')
            if len(columns) != 6:
                print('Error: MANTIS expects a 6-column BED file with' +
                    ' the 4th column containing the kmer repeat' +
                    ' sequence and count (e.g. (T)15 or (CAC)5 ).')
                print('\nOffending line (line {0}) has:'.format(line_number))
                for n, value in enumerate(columns):
                    print('[{0}]\t{1}'.format(n, value))
                return False

            if not _kmer_column_ok(columns[3]):
                print('Error: MANTIS expects the kmer column in the' +
                    ' BED file to follow a (XX)NN type format, with' +
                    ' the repeat unit wrapped in parentheses,' +
                    ' followed by the expected number of repeats.')
                return False
    return True
    # end .check_bedfile_format()


def _kmer_column_ok(kmer):
    """
    Return True if kmer looks like '(UNIT)COUNT', where UNIT is a
    non-empty run of A/T/C/G characters and COUNT is all digits.
    """
    if '(' not in kmer or ')' not in kmer:
        return False

    pieces = kmer.split(')')
    if '(' not in pieces[0]:
        # The closing parenthesis came before any opening one.
        return False

    kmer_unit = pieces[0].split('(')[1]
    kmer_count = pieces[1]
    if not kmer_count.isdigit():
        return False

    # An empty repeat unit, e.g. '()5', cannot describe a repeat
    # and would match a zero-length slice at every position.
    if not kmer_unit:
        return False

    # Make sure only valid ATCG characters were used.
    return all(c in 'ATCG' for c in kmer_unit)
    # end ._kmer_column_ok()
"""
Determines how many times the k-mer repeats in a row
within the given sequence.
"""
def kmer_repeat_count(kmer, sequence, offset=0):
    """
    Count how many times kmer repeats back-to-back in sequence,
    starting at position offset.

    Returns 0 if the sequence does not contain kmer at offset, or
    if kmer is empty.
    """
    klen = len(kmer)
    if klen == 0:
        # A zero-length unit would match the empty slice at every
        # position without ever advancing the offset.
        return 0

    repeats = 0
    # Slicing past the end of the sequence yields a short (or empty)
    # slice that cannot equal kmer, so this loop also terminates at
    # the end of the sequence.
    while sequence[offset:offset + klen] == kmer:
        repeats += 1
        offset += klen
    return repeats
    # end kmer_repeat_count()
def required_modules_present(modules):
    """
    Check that every named module can be found in the environment.

    Each name is lower-cased before lookup. If any module cannot be
    located, an error line is printed for each missing module,
    followed by a $PYTHONPATH hint, and the interpreter exits with
    status 1 (SystemExit). Returns True when all modules are found.
    """
    # Prefer importlib's finder; the legacy 'imp' module is only
    # used as a fallback on interpreters without importlib.util
    # (e.g. Python 2).
    try:
        from importlib.util import find_spec
    except ImportError:
        find_spec = None

    missing = []
    for module in modules:
        name = module.lower()
        try:
            if find_spec is not None:
                found = find_spec(name) is not None
            else:
                __import__('imp').find_module(name)
                found = True
        except ImportError:
            # Module (or one of its parent packages) not found
            found = False
        if not found:
            missing.append(module)

    if missing:
        # Only report the modules that are actually unavailable.
        for module in missing:
            print('Error: You must have {0} available in your environment!'.format(module))
        print('Please check your $PYTHONPATH to make sure you have properly ' +
            'included required modules/libraries in it.')
        raise SystemExit(1)
    return True
    # end required_modules_present()